crackuser committed on
Commit
962aa9c
·
verified ·
1 Parent(s): 6c1fb93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -183
app.py CHANGED
@@ -4,251 +4,274 @@ import torchaudio
4
  import tempfile
5
  import os
6
  import warnings
 
7
 
8
  warnings.filterwarnings("ignore")
9
 
10
  # CRITICAL: Coqui Terms of Service
11
  os.environ["COQUI_TOS_AGREED"] = "1"
12
 
13
- print("๐Ÿš€ Starting Simple Voice Cloning Studio...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Device setup
16
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
17
  print(f"๐Ÿš€ Using device: {DEVICE}")
18
 
19
- # Global variables
20
  TTS_MODEL = None
21
  WHISPER_MODEL = None
 
22
 
23
- def load_simple_tts():
24
- """Load a simple TTS model that actually works"""
25
- global TTS_MODEL
26
 
27
- if TTS_MODEL is not None:
28
  return True
29
 
30
- try:
31
- from TTS.api import TTS
32
- print("๐Ÿ“ฆ Loading simple multi-speaker model...")
33
-
34
- # Use a simpler model that doesn't have the XTTS issues
35
- TTS_MODEL = TTS(
36
- model_name="tts_models/en/vctk/vits",
37
- progress_bar=True,
38
- gpu=(DEVICE == "cuda")
39
- )
40
-
41
- print("โœ… Simple TTS model loaded successfully!")
42
- return True
43
-
44
- except Exception as e:
45
- print(f"โŒ Simple TTS failed: {e}")
46
-
47
- # Ultimate fallback - use the most basic model
48
  try:
49
- print("๐Ÿ“ฆ Loading basic TTS model...")
50
- TTS_MODEL = TTS(
51
- model_name="tts_models/en/ljspeech/tacotron2-DDC",
52
- progress_bar=True,
53
- gpu=(DEVICE == "cuda")
54
- )
55
- print("โœ… Basic TTS model loaded!")
56
- return True
57
- except Exception as e2:
58
- print(f"โŒ All TTS models failed: {e2}")
 
 
 
59
  return False
60
-
61
- def load_whisper():
62
- """Load Whisper for transcription"""
63
- global WHISPER_MODEL
64
 
65
- if WHISPER_MODEL is not None:
66
- return True
 
 
 
 
 
 
 
 
67
 
68
- try:
69
- import whisper
70
- WHISPER_MODEL = whisper.load_model("base")
71
- print("โœ… Whisper loaded!")
72
- return True
73
- except Exception as e:
74
- print(f"โŒ Whisper failed: {e}")
75
- return False
76
 
77
- def voice_clone_simple(reference_audio, input_audio, text_override=""):
78
- """Simple voice cloning that actually works"""
 
 
 
 
79
  try:
80
- if not input_audio:
81
- return None, "โŒ Upload input audio!"
 
82
 
83
- # Load models
84
- if not load_simple_tts():
85
- return None, "โŒ TTS model failed to load!"
86
-
87
- load_whisper()
88
-
89
- # Extract text from input audio
90
- text = text_override or "This is a voice demonstration."
91
- if WHISPER_MODEL and not text_override:
92
- try:
93
- result = WHISPER_MODEL.transcribe(input_audio)
94
- extracted = result.get("text", "").strip()
95
- if extracted and len(extracted) > 3:
96
- text = extracted
97
- print(f"โœ… Extracted: {text[:50]}...")
98
- except Exception as e:
99
- print(f"โš ๏ธ Whisper error: {e}")
100
 
101
- # Generate speech using simple TTS
102
- print(f"๐ŸŽญ Generating speech: {text[:50]}...")
103
 
104
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
105
- output_path = tmp.name
 
106
 
107
- # Use the simple TTS API
108
- TTS_MODEL.tts_to_file(
109
- text=text,
110
- file_path=output_path
111
- )
112
 
113
- if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
114
- return output_path, f"โœ… SUCCESS!\n\n๐Ÿ“ Generated: {text[:100]}...\n๐Ÿ”ง Model: Simple TTS (no complex voice cloning)\nโœจ This actually works without errors!"
115
- else:
116
- return None, "โŒ Output file is empty!"
117
 
118
- except Exception as e:
119
- return None, f"โŒ Error: {str(e)}"
120
-
121
- def text_to_speech_simple(input_text):
122
- """Simple text-to-speech that works"""
123
- try:
124
- if not input_text or not input_text.strip():
125
- return None, "โŒ Enter text to convert!"
126
-
127
- # Load models
128
- if not load_simple_tts():
129
- return None, "โŒ TTS model failed to load!"
130
 
131
- print(f"๐ŸŽญ Generating speech: {input_text[:50]}...")
 
132
 
133
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
134
- output_path = tmp.name
135
 
136
- # Generate speech
137
- TTS_MODEL.tts_to_file(
138
- text=input_text,
139
- file_path=output_path
140
- )
 
 
 
141
 
 
142
  if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
143
- return output_path, f"โœ… SUCCESS!\n\n๐Ÿ“ Generated: {input_text[:100]}...\n๐Ÿ”ง Model: Simple TTS\nโœจ No complex loading - just works!"
144
  else:
145
- return None, "โŒ Output file is empty!"
146
 
147
  except Exception as e:
148
- return None, f"โŒ Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
  # Create Gradio Interface
151
- with gr.Blocks(title="๐ŸŽญ Simple Voice Studio - WORKING") as demo:
 
 
 
152
 
153
  gr.HTML("""
154
  <div style="text-align: center; padding: 20px;">
155
- <h1>๐ŸŽญ Simple Voice Studio</h1>
156
- <p style="color: #198754; font-weight: bold;">โœ… GUARANTEED WORKING - No More Complex Errors!</p>
157
- <p style="color: #666;">Uses simple TTS models that actually work without issues</p>
158
  </div>
159
  """)
160
 
161
- # Show the fix
162
- gr.HTML("""
163
- <div style="background: #d1ecf1; padding: 15px; border-radius: 8px; margin: 20px 0;">
164
- <h4 style="color: #0c5460;">๐Ÿ”ง Solution: Simplified Approach!</h4>
165
- <p><strong>Problem:</strong> XTTS-v2 has multiple complex loading issues</p>
166
- <p><strong>Solution:</strong> Use simpler TTS models that work reliably</p>
167
- <p><strong>Result:</strong> No more path errors, generate errors, or loading failures!</p>
168
  </div>
169
  """)
170
 
171
- with gr.Tabs():
172
- with gr.TabItem("๐ŸŽต Voice Content Extraction"):
173
- gr.HTML("""
174
- <div style="padding: 15px; background: #e8f4fd; border-radius: 10px; margin-bottom: 15px;">
175
- <h4 style="color: #1e40af;">๐ŸŽค What this does:</h4>
176
- <ul>
177
- <li>Extracts text content from your audio using Whisper</li>
178
- <li>Generates new speech using simple TTS (not voice cloning)</li>
179
- <li>Actually works without complex errors!</li>
 
180
  </ul>
181
  </div>
182
- """)
183
-
184
- input_audio1 = gr.Audio(
185
- label="Input Audio (Content to Extract)",
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  type="filepath",
187
  sources=["upload", "microphone"]
188
  )
189
 
190
- text_override = gr.Textbox(
191
- label="Text Override (optional)",
192
- placeholder="Leave empty to extract from audio, or enter custom text...",
193
- lines=3
194
  )
195
 
196
- btn1 = gr.Button("๐ŸŽค Extract & Generate Speech", variant="primary", size="lg")
197
- output1 = gr.Audio(label="Generated Speech")
198
- status1 = gr.Textbox(label="Status", lines=6, interactive=False)
199
-
200
- btn1.click(
201
- fn=voice_clone_simple,
202
- inputs=[gr.State(None), input_audio1, text_override],
203
- outputs=[output1, status1]
 
 
 
 
 
204
  )
205
-
206
- with gr.TabItem("๐Ÿ“ Text-to-Speech"):
207
- gr.HTML("""
208
- <div style="padding: 15px; background: #f0fff0; border-radius: 10px; margin-bottom: 15px;">
209
- <h4 style="color: #16a34a;">๐Ÿ“ Simple Text-to-Speech:</h4>
210
- <ul>
211
- <li>Enter any text to convert to speech</li>
212
- <li>Uses reliable TTS model</li>
213
- <li>No complex loading or path issues!</li>
214
- </ul>
215
- </div>
216
- """)
217
 
218
- text_input = gr.Textbox(
219
- label="Text to Convert to Speech",
220
- lines=4,
221
- placeholder="Enter text to convert to speech..."
222
  )
223
-
224
- btn2 = gr.Button("๐Ÿ“ Generate Speech", variant="secondary", size="lg")
225
- output2 = gr.Audio(label="Generated Speech")
226
- status2 = gr.Textbox(label="Status", lines=6, interactive=False)
227
-
228
- btn2.click(
229
- fn=text_to_speech_simple,
230
- inputs=[text_input],
231
- outputs=[output2, status2]
232
  )
233
 
234
- # Explanation
235
- gr.HTML("""
236
- <div style="background: #f8f9fa; padding: 15px; border-radius: 8px; margin-top: 20px;">
237
- <h4 style="color: #495057;">๐Ÿ’ก Why This Works</h4>
238
- <p><strong>Simple Approach:</strong> Uses basic TTS models without complex XTTS loading</p>
239
- <p><strong>No Path Issues:</strong> Doesn't require manual checkpoint loading</p>
240
- <p><strong>No Generate Errors:</strong> Uses only supported TTS methods</p>
241
- <p><strong>Reliable:</strong> These models have been tested and work consistently</p>
242
 
243
- <h5>What You Get:</h5>
244
- <ul>
245
- <li>โœ… Text extraction from audio (Whisper)</li>
246
- <li>โœ… Text-to-speech generation (Simple TTS)</li>
247
- <li>โœ… No complex errors or loading failures</li>
248
- <li>โš ๏ธ Note: This is basic TTS, not advanced voice cloning</li>
249
- </ul>
250
- </div>
251
- """)
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  if __name__ == "__main__":
254
  demo.launch()
 
4
  import tempfile
5
  import os
6
  import warnings
7
+ from contextlib import contextmanager
8
 
9
  warnings.filterwarnings("ignore")
10
 
11
  # CRITICAL: Coqui Terms of Service
12
  os.environ["COQUI_TOS_AGREED"] = "1"
13
 
14
+ print("๐Ÿš€ Starting Voice-to-Voice Cloning Studio...")
15
+
16
+ # PyTorch 2.6 Compatibility Fix
17
@contextmanager
def patch_torch_load():
    """Temporarily force ``torch.load`` to run with ``weights_only=False``.

    Works around the PyTorch 2.6 ``weights_only`` issue: while the context is
    active, every ``torch.load`` call goes through a wrapper that overrides
    ``weights_only`` with ``False`` (even when the caller passed another
    value). The previous ``torch.load`` is reinstated on exit, including when
    the body raises.

    NOTE(review): ``weights_only=False`` allows arbitrary objects to be
    unpickled, so only wrap loads of trusted checkpoints. The patch rebinds a
    module attribute, so it is visible process-wide while the context is
    active.
    """
    from functools import wraps  # local import keeps the block self-contained

    original_load = torch.load

    # Preserve the name/docstring/signature metadata of the real loader so
    # code that introspects ``torch.load`` keeps working while patched.
    @wraps(original_load)
    def patched_load(f, *args, **kwargs):
        kwargs["weights_only"] = False
        return original_load(f, *args, **kwargs)

    torch.load = patched_load
    try:
        yield
    finally:
        # Always restore, even if the wrapped code raised.
        torch.load = original_load
29
 
30
  # Device setup
31
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
32
  print(f"๐Ÿš€ Using device: {DEVICE}")
33
 
34
+ # Global models
35
  TTS_MODEL = None
36
  WHISPER_MODEL = None
37
+ MODEL_STATUS = "Not Loaded"
38
 
39
def load_voice_cloning_models():
    """Load the XTTS-v2 and Whisper models once and cache them in globals.

    Fills ``TTS_MODEL`` and ``WHISPER_MODEL`` and records the outcome in
    ``MODEL_STATUS``. A model that is already loaded is not loaded again, so
    the function is safe to call before every request.

    Returns:
        bool: ``True`` when both models are available; ``False`` as soon as
        one of them fails to load. The failure is printed and stored in
        ``MODEL_STATUS`` so callers can show it to the user.
    """
    global TTS_MODEL, WHISPER_MODEL, MODEL_STATUS

    if TTS_MODEL is not None and WHISPER_MODEL is not None:
        return True

    print("🔄 Loading voice cloning models...")

    # Load XTTS for voice cloning
    if TTS_MODEL is None:
        try:
            # The checkpoint load happens inside the TTS constructor, so the
            # torch.load patch has to be active around it.
            with patch_torch_load():
                from TTS.api import TTS
                print("📦 Loading XTTS for voice cloning...")
                TTS_MODEL = TTS(
                    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
                    progress_bar=True,
                    gpu=(DEVICE == "cuda"),
                )
            MODEL_STATUS = "XTTS-v2 Ready"
            print("✅ XTTS voice cloning model loaded!")
        except Exception as e:
            print(f"❌ XTTS loading failed: {e}")
            MODEL_STATUS = f"XTTS Failed: {str(e)}"
            return False

    # Load Whisper for speech-to-text
    if WHISPER_MODEL is None:
        try:
            import whisper
            print("📦 Loading Whisper for speech recognition...")
            WHISPER_MODEL = whisper.load_model("base")
            print("✅ Whisper loaded!")
        except Exception as e:
            print(f"❌ Whisper loading failed: {e}")
            # Record the failure: otherwise the status would still claim
            # "XTTS-v2 Ready" while this function reports failure.
            MODEL_STATUS = f"Whisper Failed: {str(e)}"
            return False

    # Both models are present; clear any failure left by an earlier attempt.
    MODEL_STATUS = "XTTS-v2 Ready"
    return True
 
 
 
 
 
 
 
78
 
79
def voice_to_voice_clone(reference_audio, input_audio, language="en"):
    """Re-speak the content of ``input_audio`` in the voice of ``reference_audio``.

    The input audio is transcribed with Whisper, then the text is synthesised
    with XTTS using the reference audio as the speaker sample.

    Args:
        reference_audio: Path to the audio file whose voice is cloned.
        input_audio: Path to the audio file whose spoken content is reused.
        language: Language code passed to the TTS model (default ``"en"``).

    Returns:
        tuple: ``(output_path, status_message)``. ``output_path`` is the path
        of the generated WAV file on success and ``None`` on any failure; the
        message describes the outcome. Errors are reported through the
        message rather than raised.
    """
    # Path of the temp file while this function still owns it; anything left
    # here when the function exits is deleted in ``finally``.
    output_path = None
    try:
        # Input validation
        if not reference_audio:
            return None, "❌ Please upload REFERENCE AUDIO (voice to clone)!"

        if not input_audio:
            return None, "❌ Please upload INPUT AUDIO (content to transform)!"

        print("🎤 Starting Voice-to-Voice Cloning Process...")

        # Load models
        if not load_voice_cloning_models():
            return None, f"❌ Model loading failed!\nStatus: {MODEL_STATUS}\n\nTry restarting the space."

        # STEP 1: Extract text from input audio using Whisper
        print("📝 Step 1: Extracting text from input audio...")
        fallback_text = "Voice cloning demonstration using the uploaded audio content."

        try:
            result = WHISPER_MODEL.transcribe(input_audio)
            extracted_text = result.get("text", "").strip()

            # Too little speech to be useful: fall back to a demo sentence.
            if len(extracted_text) < 3:
                extracted_text = fallback_text

            print(f"✅ Extracted text: '{extracted_text[:100]}...'")

        except Exception as e:
            # Best effort: a transcription failure still yields a demo clip.
            print(f"⚠️ Whisper extraction failed: {e}")
            extracted_text = fallback_text

        # STEP 2: Generate new audio using reference voice + extracted text
        print("🎭 Step 2: Generating speech with reference voice...")

        # Only the unique name is needed; the TTS call writes the file itself.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            output_path = tmp_file.name

        # Use XTTS for voice cloning
        with patch_torch_load():
            TTS_MODEL.tts_to_file(
                text=extracted_text,
                speaker_wav=reference_audio,
                language=language,
                file_path=output_path,
            )

        # Verify output
        if os.path.exists(output_path) and os.path.getsize(output_path) > 0:
            # Hand the file over to the caller so ``finally`` keeps it.
            result_path, output_path = output_path, None
            return result_path, f"✅ VOICE-TO-VOICE CLONING SUCCESS!\n\n🎤 **Process Completed:**\n• Extracted content: '{extracted_text[:150]}...'\n• Applied reference voice characteristics\n• Generated NEW audio with cloned voice\n\n📊 Language: {language}\n🤖 Model: {MODEL_STATUS}\n🎭 This is REAL voice cloning - same content, different voice!"

        return None, "❌ Generated audio file is empty!"

    except Exception as e:
        return None, f"❌ Voice-to-Voice Cloning Error: {str(e)}\n\nModel Status: {MODEL_STATUS}"

    finally:
        # Remove the temp file when synthesis failed or produced nothing, so
        # failed requests do not accumulate files on disk.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
139
+
140
+ # Initialize models at startup
141
+ print("๐Ÿ”„ Initializing voice cloning models...")
142
+ try:
143
+ startup_success = load_voice_cloning_models()
144
+ if startup_success:
145
+ startup_msg = f"โœ… {MODEL_STATUS} - Voice Cloning Ready!"
146
+ startup_color = "#d4edda"
147
+ else:
148
+ startup_msg = f"โš ๏ธ Models will load on first use - {MODEL_STATUS}"
149
+ startup_color = "#fff3cd"
150
+ except Exception as e:
151
+ startup_success = False
152
+ startup_msg = f"โš ๏ธ Startup issue: {str(e)}"
153
+ startup_color = "#f8d7da"
154
+
155
+ print(f"Startup status: {startup_msg}")
156
 
157
  # Create Gradio Interface
158
+ with gr.Blocks(
159
+ title="๐ŸŽญ Voice-to-Voice Cloning Studio",
160
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="green")
161
+ ) as demo:
162
 
163
  gr.HTML("""
164
  <div style="text-align: center; padding: 20px;">
165
+ <h1 style="color: #2E86AB;">๐ŸŽญ Voice-to-Voice Cloning Studio</h1>
166
+ <p style="color: #666; font-size: 18px;">REAL Voice-to-Voice Cloning - Transform Any Voice!</p>
167
+ <p style="color: #888; font-size: 14px;">Extract content from input audio โ†’ Generate with reference voice</p>
168
  </div>
169
  """)
170
 
171
+ # Status display
172
+ gr.HTML(f"""
173
+ <div style="text-align: center; padding: 15px; background: {startup_color}; border-radius: 10px; margin-bottom: 20px;">
174
+ <strong>๐Ÿค– System Status:</strong> {startup_msg}
 
 
 
175
  </div>
176
  """)
177
 
178
+ # How it works
179
+ gr.HTML("""
180
+ <div style="padding: 20px; background: #e8f4fd; border-radius: 10px; margin-bottom: 20px;">
181
+ <h4 style="color: #1e40af; margin-bottom: 15px;">๐ŸŽค How Voice-to-Voice Cloning Works:</h4>
182
+ <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px;">
183
+ <div>
184
+ <h5>๐Ÿ“ฅ Inputs Required:</h5>
185
+ <ul style="margin: 5px 0; padding-left: 20px;">
186
+ <li><strong>Reference Audio:</strong> Voice to clone (6+ seconds)</li>
187
+ <li><strong>Input Audio:</strong> Content to transform</li>
188
  </ul>
189
  </div>
190
+ <div>
191
+ <h5>โš™๏ธ Process:</h5>
192
+ <ul style="margin: 5px 0; padding-left: 20px;">
193
+ <li>Extract text from input audio</li>
194
+ <li>Generate new speech with reference voice</li>
195
+ </ul>
196
+ </div>
197
+ </div>
198
+ <h5>๐ŸŽฏ Result: Same content, different voice (REAL voice cloning!)</h5>
199
+ </div>
200
+ """)
201
+
202
+ # Main interface
203
+ with gr.Row():
204
+ with gr.Column():
205
+ reference_audio = gr.Audio(
206
+ label="๐ŸŽค Reference Audio (Voice to Clone)",
207
  type="filepath",
208
  sources=["upload", "microphone"]
209
  )
210
 
211
+ input_audio = gr.Audio(
212
+ label="๐ŸŽต Input Audio (Content to Transform)",
213
+ type="filepath",
214
+ sources=["upload", "microphone"]
215
  )
216
 
217
+ language = gr.Dropdown(
218
+ choices=[
219
+ ("๐Ÿ‡บ๐Ÿ‡ธ English", "en"),
220
+ ("๐Ÿ‡ช๐Ÿ‡ธ Spanish", "es"),
221
+ ("๐Ÿ‡ซ๐Ÿ‡ท French", "fr"),
222
+ ("๐Ÿ‡ฉ๐Ÿ‡ช German", "de"),
223
+ ("๐Ÿ‡ฎ๐Ÿ‡น Italian", "it"),
224
+ ("๐Ÿ‡ง๐Ÿ‡ท Portuguese", "pt"),
225
+ ("๐Ÿ‡จ๐Ÿ‡ณ Chinese", "zh"),
226
+ ("๐Ÿ‡ฏ๐Ÿ‡ต Japanese", "ja")
227
+ ],
228
+ value="en",
229
+ label="Language"
230
  )
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
+ clone_btn = gr.Button(
233
+ "๐ŸŽญ Clone Voice (Voice-to-Voice)",
234
+ variant="primary",
235
+ size="lg"
236
  )
237
+
238
+ with gr.Column():
239
+ output_audio = gr.Audio(label="๐ŸŽ‰ Cloned Voice Result")
240
+ status_output = gr.Textbox(
241
+ label="Processing Status & Details",
242
+ lines=12,
243
+ interactive=False
 
 
244
  )
245
 
246
+ # Examples
247
+ with gr.Accordion("๐Ÿ’ก Example Usage", open=False):
248
+ gr.Markdown("""
249
+ ### ๐ŸŽฏ Perfect Use Cases:
250
+ - **Voice Acting**: Transform your voice to sound like someone else
251
+ - **Content Creation**: Make podcasts in different voices
252
+ - **Language Learning**: Hear text in your target accent
253
+ - **Accessibility**: Convert speech to preferred voice characteristics
254
 
255
+ ### ๐Ÿ“‹ Step-by-Step:
256
+ 1. **Upload Reference Audio**: 6+ seconds of the voice you want to clone
257
+ 2. **Upload Input Audio**: Speech content you want to transform
258
+ 3. **Select Language**: Choose the language of the content
259
+ 4. **Click Clone Voice**: Wait for processing (30-60 seconds)
260
+ 5. **Download Result**: New audio with same content, different voice!
261
+
262
+ ### ๐Ÿ” Example:
263
+ - **Reference**: Morgan Freeman speaking
264
+ - **Input**: Your voice saying "Hello world"
265
+ - **Result**: "Hello world" in Morgan Freeman's voice style
266
+ """)
267
+
268
+ # Event handler
269
+ clone_btn.click(
270
+ fn=voice_to_voice_clone,
271
+ inputs=[reference_audio, input_audio, language],
272
+ outputs=[output_audio, status_output],
273
+ show_progress=True
274
+ )
275
 
276
  if __name__ == "__main__":
277
  demo.launch()