crackuser committed on
Commit
f758d08
·
verified ·
1 Parent(s): 8581048

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -500
app.py CHANGED
@@ -1,515 +1,65 @@
1
- import streamlit as st
2
  import torch
3
  import torchaudio
4
  import numpy as np
5
- import librosa
6
- import soundfile as sf
7
- import matplotlib.pyplot as plt
8
- import plotly.graph_objects as go
9
- import plotly.express as px
10
- from scipy.signal import butter, filtfilt
11
  import tempfile
12
  import os
13
- import io
14
- import base64
15
- from datetime import datetime
16
- import requests
17
- import zipfile
18
- from pathlib import Path
19
- import pickle
20
- import json
21
 
22
- # Import voice cloning modules
23
- from voice_cloning_engine import VoiceCloningEngine
24
- from audio_processor import AudioProcessor
25
- from voice_analyzer import VoiceAnalyzer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- # Page configuration
28
- st.set_page_config(
29
- page_title="AI Voice Clone Studio",
30
- page_icon="🎭",
31
- layout="wide",
32
- initial_sidebar_state="expanded"
33
- )
34
-
35
- # Custom CSS
36
- st.markdown("""
37
- <style>
38
- .main-header {
39
- font-size: 3rem;
40
- font-weight: bold;
41
- text-align: center;
42
- margin-bottom: 2rem;
43
- background: linear-gradient(90deg, #ff6b6b, #4ecdc4, #45b7d1);
44
- -webkit-background-clip: text;
45
- -webkit-text-fill-color: transparent;
46
- background-clip: text;
47
- }
48
- .clone-box {
49
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
50
- padding: 2rem;
51
- border-radius: 15px;
52
- color: white;
53
- margin: 1rem 0;
54
- }
55
- .reference-box {
56
- background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
57
- padding: 1.5rem;
58
- border-radius: 10px;
59
- color: white;
60
- margin: 1rem 0;
61
- }
62
- .input-box {
63
- background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
64
- padding: 1.5rem;
65
- border-radius: 10px;
66
- color: white;
67
- margin: 1rem 0;
68
- }
69
- .result-box {
70
- background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%);
71
- padding: 1.5rem;
72
- border-radius: 10px;
73
- color: white;
74
- margin: 1rem 0;
75
- }
76
- .stAudio {
77
- margin: 1rem 0;
78
- }
79
- </style>
80
- """, unsafe_allow_html=True)
81
-
82
- # Initialize session state
83
- if 'cloning_engine' not in st.session_state:
84
- st.session_state.cloning_engine = None
85
- if 'reference_voice' not in st.session_state:
86
- st.session_state.reference_voice = None
87
- if 'cloned_audio' not in st.session_state:
88
- st.session_state.cloned_audio = None
89
- if 'voice_profiles' not in st.session_state:
90
- st.session_state.voice_profiles = {}
91
-
92
- @st.cache_resource
93
- def load_cloning_engine():
94
- """Initialize the voice cloning engine"""
95
- return VoiceCloningEngine()
96
-
97
- def save_uploaded_file(uploaded_file, directory="temp"):
98
- """Save uploaded file to directory"""
99
- if uploaded_file is not None:
100
- os.makedirs(directory, exist_ok=True)
101
- file_path = os.path.join(directory, uploaded_file.name)
102
- with open(file_path, "wb") as f:
103
- f.write(uploaded_file.getbuffer())
104
- return file_path
105
- return None
106
-
107
- def create_audio_comparison(original_audio, cloned_audio, sample_rate):
108
- """Create side-by-side audio comparison"""
109
- fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
110
-
111
- # Original audio
112
- time_original = np.linspace(0, len(original_audio) / sample_rate, len(original_audio))
113
- ax1.plot(time_original, original_audio, color='blue', alpha=0.7)
114
- ax1.set_title('Original Audio', fontsize=14, fontweight='bold')
115
- ax1.set_xlabel('Time (seconds)')
116
- ax1.set_ylabel('Amplitude')
117
- ax1.grid(True, alpha=0.3)
118
-
119
- # Cloned audio
120
- time_cloned = np.linspace(0, len(cloned_audio) / sample_rate, len(cloned_audio))
121
- ax2.plot(time_cloned, cloned_audio, color='red', alpha=0.7)
122
- ax2.set_title('Voice Cloned Audio', fontsize=14, fontweight='bold')
123
- ax2.set_xlabel('Time (seconds)')
124
- ax2.set_ylabel('Amplitude')
125
- ax2.grid(True, alpha=0.3)
126
-
127
- plt.tight_layout()
128
- return fig
129
-
130
- def create_spectrogram_comparison(original_audio, cloned_audio, sample_rate):
131
- """Create spectrogram comparison"""
132
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
133
-
134
- # Original spectrogram
135
- D1 = librosa.amplitude_to_db(np.abs(librosa.stft(original_audio)), ref=np.max)
136
- librosa.display.specshow(D1, sr=sample_rate, x_axis='time', y_axis='hz', ax=ax1, cmap='viridis')
137
- ax1.set_title('Original Audio Spectrogram')
138
-
139
- # Cloned spectrogram
140
- D2 = librosa.amplitude_to_db(np.abs(librosa.stft(cloned_audio)), ref=np.max)
141
- librosa.display.specshow(D2, sr=sample_rate, x_axis='time', y_axis='hz', ax=ax2, cmap='viridis')
142
- ax2.set_title('Voice Cloned Audio Spectrogram')
143
-
144
- plt.tight_layout()
145
- return fig
146
-
147
- def main():
148
- # Header
149
- st.markdown('<div class="main-header">🎭 AI Voice Clone Studio</div>', unsafe_allow_html=True)
150
- st.markdown("### Transform any voice into any other voice with advanced AI")
151
 
152
- # Initialize cloning engine
153
- if st.session_state.cloning_engine is None:
154
- with st.spinner("🚀 Loading Voice Cloning Engine..."):
155
- st.session_state.cloning_engine = load_cloning_engine()
156
-
157
- # Sidebar Configuration
158
- with st.sidebar:
159
- st.header("⚙️ Voice Cloning Settings")
160
-
161
- # Model Selection
162
- cloning_method = st.selectbox(
163
- "Cloning Method:",
164
- ["OpenVoice", "Real-Time VC", "SV2TTS", "Neural Voice Puppetry"],
165
- help="Choose the voice cloning algorithm"
166
- )
167
-
168
- # Quality Settings
169
- st.subheader("🎛️ Quality Settings")
170
- quality_level = st.select_slider(
171
- "Quality Level:",
172
- options=["Fast", "Balanced", "High Quality"],
173
- value="Balanced"
174
- )
175
-
176
- preserve_emotion = st.checkbox("Preserve Emotion", value=True)
177
- preserve_accent = st.checkbox("Preserve Accent", value=True)
178
- preserve_pace = st.checkbox("Preserve Speaking Pace", value=True)
179
-
180
- # Advanced Settings
181
- with st.expander("🔧 Advanced Settings"):
182
- similarity_threshold = st.slider("Voice Similarity Threshold", 0.5, 1.0, 0.8)
183
- noise_reduction = st.checkbox("Apply Noise Reduction", value=True)
184
- auto_trim = st.checkbox("Auto-trim Silence", value=True)
185
- enhance_quality = st.checkbox("Enhance Audio Quality", value=True)
186
-
187
- # Main Interface
188
- col1, col2 = st.columns([1, 1])
189
-
190
- # Reference Voice Section
191
- with col1:
192
- st.markdown("""
193
- <div class="reference-box">
194
- <h3>🎤 Reference Voice (Target)</h3>
195
- <p>Upload or record the voice you want to clone</p>
196
- </div>
197
- """, unsafe_allow_html=True)
198
-
199
- reference_method = st.radio(
200
- "Reference Voice Input:",
201
- ["Upload Audio File", "Record Live", "Use Saved Profile"],
202
- horizontal=True
203
- )
204
-
205
- reference_audio_data = None
206
- reference_sr = None
207
-
208
- if reference_method == "Upload Audio File":
209
- reference_file = st.file_uploader(
210
- "Upload Reference Voice:",
211
- type=['wav', 'mp3', 'flac', 'm4a'],
212
- help="Upload a clear audio sample of the target voice (10+ seconds recommended)"
213
  )
214
-
215
- if reference_file:
216
- file_path = save_uploaded_file(reference_file, "reference_voices")
217
- reference_audio_data, reference_sr = librosa.load(file_path, sr=None)
218
- st.audio(reference_file, format='audio/wav')
219
-
220
- # Voice Analysis
221
- if st.button("🔍 Analyze Reference Voice"):
222
- with st.spinner("Analyzing voice characteristics..."):
223
- analyzer = VoiceAnalyzer()
224
- voice_features = analyzer.analyze_voice(reference_audio_data, reference_sr)
225
-
226
- st.json(voice_features)
227
-
228
- elif reference_method == "Record Live":
229
- st.info("🎙️ Use the record button below to capture reference voice")
230
- # Audio recorder component would go here
231
- # For now, showing placeholder
232
- st.warning("Live recording feature requires additional setup")
233
-
234
- elif reference_method == "Use Saved Profile":
235
- if st.session_state.voice_profiles:
236
- selected_profile = st.selectbox(
237
- "Select Voice Profile:",
238
- list(st.session_state.voice_profiles.keys())
239
- )
240
-
241
- if selected_profile:
242
- profile_data = st.session_state.voice_profiles[selected_profile]
243
- reference_audio_data = profile_data['audio_data']
244
- reference_sr = profile_data['sample_rate']
245
- st.success(f"✅ Loaded voice profile: {selected_profile}")
246
- else:
247
- st.info("No saved voice profiles available")
248
-
249
- # Input Audio Section
250
- with col2:
251
- st.markdown("""
252
- <div class="input-box">
253
- <h3>📢 Input Audio (Source)</h3>
254
- <p>Upload the audio you want to transform</p>
255
- </div>
256
- """, unsafe_allow_html=True)
257
-
258
- input_method = st.radio(
259
- "Input Audio Method:",
260
- ["Upload Audio File", "Record Live", "Text-to-Speech"],
261
- horizontal=True
262
- )
263
-
264
- input_audio_data = None
265
- input_sr = None
266
-
267
- if input_method == "Upload Audio File":
268
- input_file = st.file_uploader(
269
- "Upload Input Audio:",
270
- type=['wav', 'mp3', 'flac', 'm4a'],
271
- help="Upload the audio you want to transform to the reference voice"
272
- )
273
-
274
- if input_file:
275
- file_path = save_uploaded_file(input_file, "temp")
276
- input_audio_data, input_sr = librosa.load(file_path, sr=None)
277
- st.audio(input_file, format='audio/wav')
278
-
279
- elif input_method == "Record Live":
280
- st.info("🎙️ Use the record button below to capture input audio")
281
- st.warning("Live recording feature requires additional setup")
282
-
283
- elif input_method == "Text-to-Speech":
284
- tts_text = st.text_area(
285
- "Enter text to convert:",
286
- height=150,
287
- placeholder="Type the text you want to speak in the cloned voice..."
288
  )
289
-
290
- if tts_text and st.button("🗣️ Generate TTS"):
291
- with st.spinner("Generating speech from text..."):
292
- # Generate TTS audio (placeholder)
293
- st.success("TTS generated! Now clone the voice.")
294
-
295
- # Voice Cloning Process
296
- if reference_audio_data is not None and input_audio_data is not None:
297
- st.markdown("---")
298
- st.markdown("""
299
- <div class="clone-box">
300
- <h2>🎭 Voice Cloning Process</h2>
301
- <p>Ready to clone the reference voice and apply it to your input audio!</p>
302
- </div>
303
- """, unsafe_allow_html=True)
304
-
305
- col1, col2, col3 = st.columns([1, 2, 1])
306
-
307
- with col2:
308
- if st.button("🚀 Start Voice Cloning", type="primary", use_container_width=True):
309
- try:
310
- with st.spinner("🎭 Cloning voice... This may take a few minutes"):
311
- progress_bar = st.progress(0)
312
- status_text = st.empty()
313
-
314
- # Step 1: Preprocess audio
315
- status_text.text("📊 Preprocessing audio...")
316
- progress_bar.progress(20)
317
-
318
- processor = AudioProcessor()
319
- ref_processed = processor.preprocess_audio(reference_audio_data, reference_sr)
320
- input_processed = processor.preprocess_audio(input_audio_data, input_sr)
321
-
322
- # Step 2: Extract voice features
323
- status_text.text("🔍 Extracting voice features...")
324
- progress_bar.progress(40)
325
-
326
- # Step 3: Voice cloning
327
- status_text.text("🎭 Performing voice cloning...")
328
- progress_bar.progress(60)
329
-
330
- cloned_audio = st.session_state.cloning_engine.clone_voice(
331
- reference_audio=ref_processed,
332
- input_audio=input_processed,
333
- method=cloning_method,
334
- preserve_emotion=preserve_emotion,
335
- preserve_accent=preserve_accent,
336
- preserve_pace=preserve_pace
337
- )
338
-
339
- # Step 4: Post-processing
340
- status_text.text("✨ Post-processing...")
341
- progress_bar.progress(80)
342
-
343
- if enhance_quality:
344
- cloned_audio = processor.enhance_audio(cloned_audio)
345
-
346
- progress_bar.progress(100)
347
- status_text.text("✅ Voice cloning completed!")
348
-
349
- # Store result
350
- st.session_state.cloned_audio = {
351
- 'audio_data': cloned_audio,
352
- 'sample_rate': input_sr,
353
- 'original_input': input_audio_data,
354
- 'reference_voice': reference_audio_data
355
- }
356
-
357
- st.success("🎉 Voice cloning successful!")
358
-
359
- except Exception as e:
360
- st.error(f"❌ Error during voice cloning: {str(e)}")
361
-
362
- # Results Section
363
- if st.session_state.cloned_audio:
364
- st.markdown("---")
365
- st.markdown("""
366
- <div class="result-box">
367
- <h2>🎵 Cloning Results</h2>
368
- <p>Your voice has been successfully cloned!</p>
369
- </div>
370
- """, unsafe_allow_html=True)
371
-
372
- cloned_data = st.session_state.cloned_audio
373
-
374
- # Audio Players
375
- st.subheader("🔊 Audio Comparison")
376
-
377
- col1, col2, col3 = st.columns(3)
378
-
379
- with col1:
380
- st.markdown("**📢 Original Input:**")
381
- input_bytes = AudioProcessor.audio_to_bytes(cloned_data['original_input'], cloned_data['sample_rate'])
382
- st.audio(input_bytes, format='audio/wav')
383
-
384
- with col2:
385
- st.markdown("**🎤 Reference Voice:**")
386
- ref_bytes = AudioProcessor.audio_to_bytes(cloned_data['reference_voice'], cloned_data['sample_rate'])
387
- st.audio(ref_bytes, format='audio/wav')
388
-
389
- with col3:
390
- st.markdown("**🎭 Cloned Result:**")
391
- cloned_bytes = AudioProcessor.audio_to_bytes(cloned_data['audio_data'], cloned_data['sample_rate'])
392
- st.audio(cloned_bytes, format='audio/wav')
393
-
394
- # Visualizations
395
- st.subheader("📊 Audio Analysis")
396
-
397
- tab1, tab2, tab3 = st.tabs(["Waveform Comparison", "Spectrogram Analysis", "Voice Similarity"])
398
-
399
- with tab1:
400
- fig_wave = create_audio_comparison(
401
- cloned_data['original_input'],
402
- cloned_data['audio_data'],
403
- cloned_data['sample_rate']
404
- )
405
- st.pyplot(fig_wave)
406
-
407
- with tab2:
408
- fig_spec = create_spectrogram_comparison(
409
- cloned_data['original_input'],
410
- cloned_data['audio_data'],
411
- cloned_data['sample_rate']
412
- )
413
- st.pyplot(fig_spec)
414
-
415
- with tab3:
416
- # Voice similarity metrics
417
- analyzer = VoiceAnalyzer()
418
- similarity_score = analyzer.calculate_similarity(
419
- cloned_data['reference_voice'],
420
- cloned_data['audio_data'],
421
- cloned_data['sample_rate']
422
- )
423
-
424
- # Create similarity gauge
425
- fig_gauge = go.Figure(go.Indicator(
426
- mode = "gauge+number+delta",
427
- value = similarity_score * 100,
428
- domain = {'x': [0, 1], 'y': [0, 1]},
429
- title = {'text': "Voice Similarity Score"},
430
- delta = {'reference': 80},
431
- gauge = {
432
- 'axis': {'range': [None, 100]},
433
- 'bar': {'color': "darkblue"},
434
- 'steps': [
435
- {'range': [0, 50], 'color': "lightgray"},
436
- {'range': [50, 80], 'color': "gray"}
437
- ],
438
- 'threshold': {
439
- 'line': {'color': "red", 'width': 4},
440
- 'thickness': 0.75,
441
- 'value': 90
442
- }
443
- }
444
- ))
445
-
446
- st.plotly_chart(fig_gauge, use_container_width=True)
447
-
448
- # Download Options
449
- st.subheader("💾 Download Options")
450
-
451
- col1, col2, col3 = st.columns(3)
452
-
453
- with col1:
454
- st.download_button(
455
- label="⬇️ Download WAV",
456
- data=cloned_bytes,
457
- file_name=f"voice_cloned_{datetime.now().strftime('%Y%m%d_%H%M%S')}.wav",
458
- mime="audio/wav"
459
- )
460
-
461
- with col2:
462
- # Convert to MP3 and download
463
- if st.button("⬇️ Download MP3"):
464
- st.info("MP3 conversion feature coming soon!")
465
-
466
- with col3:
467
- # Save as voice profile
468
- profile_name = st.text_input("Voice Profile Name:", placeholder="My Voice Clone")
469
- if st.button("💾 Save Profile") and profile_name:
470
- st.session_state.voice_profiles[profile_name] = {
471
- 'audio_data': cloned_data['reference_voice'],
472
- 'sample_rate': cloned_data['sample_rate'],
473
- 'created': datetime.now().isoformat()
474
- }
475
- st.success(f"✅ Voice profile '{profile_name}' saved!")
476
-
477
- # Voice Profile Manager
478
- if st.session_state.voice_profiles:
479
- st.markdown("---")
480
- st.subheader("👤 Voice Profile Manager")
481
 
482
- for profile_name, profile_data in st.session_state.voice_profiles.items():
483
- col1, col2, col3 = st.columns([2, 1, 1])
484
-
485
- with col1:
486
- st.write(f"**{profile_name}**")
487
- st.caption(f"Created: {profile_data['created']}")
488
-
489
- with col2:
490
- audio_bytes = AudioProcessor.audio_to_bytes(
491
- profile_data['audio_data'],
492
- profile_data['sample_rate']
493
- )
494
- st.audio(audio_bytes, format='audio/wav')
495
-
496
- with col3:
497
- if st.button(f"🗑️ Delete", key=f"del_{profile_name}"):
498
- del st.session_state.voice_profiles[profile_name]
499
- st.rerun()
500
 
501
- # Footer
502
- st.markdown("---")
503
- st.markdown(
504
- """
505
- <div style="text-align: center; color: #666; padding: 2rem;">
506
- 🎭 <strong>AI Voice Clone Studio</strong> - Advanced Voice Cloning Technology<br>
507
- Transform any voice into any other voice with state-of-the-art AI<br>
508
- <small>⚠️ Use responsibly and with consent from voice owners</small>
509
- </div>
510
- """,
511
- unsafe_allow_html=True
512
  )
513
 
 
514
  if __name__ == "__main__":
515
- main()
 
1
+ import gradio as gr
2
  import torch
3
  import torchaudio
4
  import numpy as np
5
+ from transformers import AutoModel, AutoTokenizer
 
 
 
 
 
6
  import tempfile
7
  import os
 
 
 
 
 
 
 
 
8
 
9
+ def clone_voice(reference_audio, input_text):
10
+ """Voice cloning function"""
11
+ try:
12
+ # Your voice cloning logic here
13
+ # This is a basic template - replace with your actual model
14
+
15
+ # Load your model (replace with actual model loading)
16
+ # model = AutoModel.from_pretrained("your-model-name")
17
+
18
+ # Process the reference audio
19
+ if reference_audio is None:
20
+ return None, "Please upload reference audio"
21
+
22
+ # Simple echo for testing (replace with actual voice cloning)
23
+ # In a real implementation, you'd:
24
+ # 1. Process reference_audio to extract voice features
25
+ # 2. Generate speech from input_text using those features
26
+ # 3. Return the generated audio
27
+
28
+ # For now, return the reference audio as a test
29
+ return reference_audio, "Voice cloning completed (test mode)"
30
+
31
+ except Exception as e:
32
+ return None, f"Error: {str(e)}"
33
 
34
+ # Create Gradio interface
35
+ with gr.Blocks(title="Voice Cloning") as app:
36
+ gr.Markdown("# 🎭 AI Voice Cloning")
37
+ gr.Markdown("Upload reference audio and enter text to clone the voice.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ with gr.Row():
40
+ with gr.Column():
41
+ reference_audio = gr.Audio(
42
+ label="Reference Voice (10+ seconds)",
43
+ type="filepath"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  )
45
+ input_text = gr.Textbox(
46
+ label="Text to Convert",
47
+ placeholder="Enter the text you want to speak in the cloned voice...",
48
+ lines=3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  )
50
+ clone_btn = gr.Button("🎤 Clone Voice", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ with gr.Column():
53
+ output_audio = gr.Audio(label="Cloned Voice Output")
54
+ status_text = gr.Textbox(label="Status", interactive=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
+ # Connect the function
57
+ clone_btn.click(
58
+ fn=clone_voice,
59
+ inputs=[reference_audio, input_text],
60
+ outputs=[output_audio, status_text]
 
 
 
 
 
 
61
  )
62
 
63
+ # Launch the app
64
  if __name__ == "__main__":
65
+ app.launch()