tester1hf committed · verified
Commit a60e434 · Parent(s): 561919f

Update app.py

Files changed (1): app.py (+93 -61)
app.py CHANGED
@@ -6,6 +6,7 @@ from pydub import AudioSegment
 import os
 import re
 import soundfile as sf
+import time
 
 # Security bypass and TOS agreement
 os.environ["COQUI_TOS_AGREED"] = "1"
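
The next hunk's heading cites torch.load = patched_torch_load, whose definition sits above the visible diff context. A minimal sketch of the usual workaround it refers to (an assumption, since the actual definition is not shown in this diff):

import torch

# Hypothetical reconstruction: recent PyTorch versions default to
# weights_only=True in torch.load, which rejects the pickled config
# objects inside XTTS checkpoints, so the app forces the old behavior.
_original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    kwargs["weights_only"] = False  # trust the XTTS checkpoint
    return _original_torch_load(*args, **kwargs)

torch.load = patched_torch_load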
@@ -19,19 +20,22 @@ torch.load = patched_torch_load
 
 # Initialize XTTS model
 device = "cuda" if torch.cuda.is_available() else "cpu"
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
 
 def extract_speaker_embedding(audio_path):
-    # Get conditioning latents using built-in method
-    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(audio_path=[audio_path])
-
-    # Save both latents for better voice cloning
-    embedding_path = "speaker_embedding.pth"
-    torch.save({
-        "gpt_cond_latent": gpt_cond_latent.cpu(),
-        "speaker_embedding": speaker_embedding.cpu()
-    }, embedding_path)
-    return embedding_path
+    try:
+        # Get conditioning latents using built-in method
+        gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(audio_path=[audio_path])
+
+        # Save both latents
+        embedding_path = "speaker_embedding.pth"
+        torch.save({
+            "gpt_cond_latent": gpt_cond_latent.cpu(),
+            "speaker_embedding": speaker_embedding.cpu()
+        }, embedding_path)
+        return embedding_path
+    except Exception as e:
+        raise gr.Error(f"Error extracting embedding: {str(e)}")
 
 def split_text(text, max_length=182):
     sentences = []
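
The body of split_text is elided by the diff context; only its signature and return value are visible. A plausible sketch consistent with both (an illustration, not the file's actual implementation) splits on sentence-ending punctuation and greedily packs sentences into chunks of at most max_length characters:

import re

def split_text(text, max_length=182):
    # Sketch only: split on sentence boundaries, then pack sentences
    # into chunks no longer than max_length characters each.
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    processed = []
    current = ""
    for sentence in sentences:
        if current and len(current) + len(sentence) + 1 > max_length:
            processed.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        processed.append(current)
    return processed

A single sentence longer than max_length would pass through unsplit here; the real implementation may handle that case differently.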
@@ -59,63 +63,91 @@ def split_text(text, max_length=182):
     return processed
 
 def synthesize_speech(text, embedding_path):
-    # Load embeddings
-    embeddings = torch.load(embedding_path)
-    gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
-    speaker_embedding = embeddings["speaker_embedding"].to(device)
-
-    # Split text into manageable chunks
-    text_chunks = split_text(text)
-
-    # Synthesize each chunk
-    audio_chunks = []
-    for chunk in text_chunks:
-        wav = tts.synthesizer.tts_model.inference(
-            text=chunk,
-            language="ru",
-            gpt_cond_latent=gpt_cond_latent,
-            speaker_embedding=speaker_embedding,
-            temperature=0.7,
-            length_penalty=1.0,
-            repetition_penalty=2.0,
-        )
-        audio_chunks.append(np.array(wav["wav"].squeeze().cpu().numpy()))
-
-    # Combine and save audio
-    full_audio = np.concatenate(audio_chunks)
-    output_path = "output.wav"
-    sf.write(output_path, full_audio, 24000)
-    return output_path
+    try:
+        # Load embeddings
+        embeddings = torch.load(embedding_path)
+        gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
+        speaker_embedding = embeddings["speaker_embedding"].to(device)
+
+        # Split text into chunks
+        text_chunks = split_text(text)
+
+        # Synthesize each chunk
+        audio_chunks = []
+        for chunk in text_chunks:
+            start_time = time.time()
+            out = tts.synthesizer.tts_model.inference(
+                chunk,
+                "ru",
+                gpt_cond_latent,
+                speaker_embedding,
+                temperature=0.7,
+                length_penalty=1.0,
+                repetition_penalty=2.0,
+            )
+            # Convert tensor to numpy array properly
+            audio = out["wav"].squeeze().cpu().numpy()
+            audio_chunks.append(audio)
+
+        # Combine and save audio
+        full_audio = np.concatenate(audio_chunks)
+        output_path = "output.wav"
+        sf.write(output_path, full_audio, 24000)
+        return output_path
+    except Exception as e:
+        raise gr.Error(f"Error generating speech: {str(e)}")
 
 # Gradio Interface
-with gr.Blocks() as demo:
-    gr.Markdown("# XTTS v2 Voice Cloning Demo")
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🐸 XTTS v2 Voice Cloning Demo")
 
-    with gr.Tab("1. Extract Voice Embedding"):
-        gr.Markdown("Upload a Russian audio sample (3-10 seconds)")
+    with gr.Tab("🔊 Voice Embedding Creation"):
+        gr.Markdown("Upload a short Russian audio sample (3-10 seconds)")
         with gr.Row():
-            audio_input = gr.Audio(type="filepath", label="Input Audio")
-            embedding_output = gr.File(label="Embedding File")
-        extract_btn = gr.Button("Create Voice Embedding")
-        extract_btn.click(
-            extract_speaker_embedding,
-            inputs=audio_input,
-            outputs=embedding_output
-        )
+            audio_input = gr.Audio(
+                sources=["upload", "microphone"],
+                type="filepath",
+                label="Input Audio",
+                waveform_options={"sample_rate": 24000}
+            )
+            embedding_output = gr.File(label="Saved Embedding")
+        extract_btn = gr.Button("Create Voice Embedding", variant="primary")
 
-    with gr.Tab("2. Generate Speech"):
+    with gr.Tab("🗣 Speech Generation"):
         gr.Markdown("Upload embedding and enter Russian text")
         with gr.Row():
-            text_input = gr.Textbox(label="Text", lines=4, placeholder="Enter text here...")
-            embedding_input = gr.File(label="Embedding File")
+            text_input = gr.Textbox(
+                label="Text Input",
+                placeholder="Enter text to synthesize...",
+                lines=4,
+                max_lines=10
+            )
+            embedding_input = gr.File(label="Upload Embedding File")
         with gr.Row():
-            audio_output = gr.Audio(label="Generated Speech", autoplay=True)
-        synth_btn = gr.Button("Generate Speech")
-        synth_btn.click(
-            synthesize_speech,
-            inputs=[text_input, embedding_input],
-            outputs=audio_output
-        )
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                autoplay=True,
+                waveform_options={"sample_rate": 24000}
+            )
+        synth_btn = gr.Button("Generate Speech", variant="primary")
+
+    # Event handlers
+    extract_btn.click(
+        extract_speaker_embedding,
+        inputs=audio_input,
+        outputs=embedding_output
+    )
+
+    synth_btn.click(
+        synthesize_speech,
+        inputs=[text_input, embedding_input],
+        outputs=audio_output
+    )
 
 if __name__ == "__main__":
-    demo.launch(server_port=7860, share=True)
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )
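
With these changes the two pipeline functions can also be exercised without the Gradio UI. A hypothetical smoke test (the sample path and text are illustrative and not part of this commit):

# Hypothetical usage of the functions defined above; "sample_ru.wav"
# is an illustrative path, not a file shipped with this Space.
embedding_path = extract_speaker_embedding("sample_ru.wav")
wav_path = synthesize_speech(
    "Привет! Это проверка клонирования голоса.",  # "Hello! This is a voice-cloning check."
    embedding_path,
)
print(wav_path)  # -> "output.wav", written at 24 kHz by sf.write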