tester1hf committed
Commit 561919f · verified · 1 Parent(s): 4b0bfb1

Update app.py

Files changed (1): app.py (+30 -36)
app.py CHANGED
@@ -7,10 +7,10 @@ import os
 import re
 import soundfile as sf
 
-# Bypass security and agree to Coqui TOS
+# Security bypass and TOS agreement
 os.environ["COQUI_TOS_AGREED"] = "1"
 
-# Patch torch.load
+# Patch torch.load for embedding loading
 original_torch_load = torch.load
 def patched_torch_load(*args, **kwargs):
     kwargs['weights_only'] = False
@@ -22,28 +22,18 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 
 def extract_speaker_embedding(audio_path):
-    # Load and process audio
-    audio = AudioSegment.from_file(audio_path)
-    audio = audio.set_channels(1).set_frame_rate(16000)  # XTTS requires 16kHz
+    # Get conditioning latents using built-in method
+    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(audio_path=[audio_path])
 
-    # Convert to numpy array and normalize
-    audio_array = np.array(audio.get_array_of_samples()).astype(np.float32)
-    audio_array /= np.max(np.abs(audio_array))
-
-    # Convert to tensor
-    audio_tensor = torch.from_numpy(audio_array).unsqueeze(0).to(device)
-
-    # Extract embedding
-    with torch.no_grad():
-        embedding = tts.synthesizer.tts_model.speaker_manager.encoder(audio_tensor)
-
-    # Save embedding
+    # Save both latents for better voice cloning
     embedding_path = "speaker_embedding.pth"
-    torch.save(embedding.cpu(), embedding_path)
+    torch.save({
+        "gpt_cond_latent": gpt_cond_latent.cpu(),
+        "speaker_embedding": speaker_embedding.cpu()
+    }, embedding_path)
     return embedding_path
 
 def split_text(text, max_length=182):
-    # Split text into chunks with proper punctuation
     sentences = []
     current = []
     current_len = 0
@@ -60,49 +50,53 @@ def split_text(text, max_length=182):
     if current:
         sentences.append("".join(current).strip())
 
-    # Ensure sentences end with punctuation
     processed = []
     for s in sentences:
-        if not s.endswith(('.', '!', '?')):
+        if not s.endswith(('.','!','?')):
             s += '.'
         processed.append(s)
 
     return processed
 
 def synthesize_speech(text, embedding_path):
-    # Load embedding
-    embedding = torch.load(embedding_path).to(device)
+    # Load embeddings
+    embeddings = torch.load(embedding_path)
+    gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
+    speaker_embedding = embeddings["speaker_embedding"].to(device)
 
-    # Split text
+    # Split text into manageable chunks
    text_chunks = split_text(text)
 
     # Synthesize each chunk
     audio_chunks = []
     for chunk in text_chunks:
-        wav = tts.tts(
+        wav = tts.synthesizer.tts_model.inference(
             text=chunk,
-            speaker_wav=None,
-            speaker_embedding=embedding,
             language="ru",
+            gpt_cond_latent=gpt_cond_latent,
+            speaker_embedding=speaker_embedding,
+            temperature=0.7,
+            length_penalty=1.0,
+            repetition_penalty=2.0,
         )
-        audio_chunks.append(np.array(wav))
+        audio_chunks.append(np.array(wav["wav"].squeeze().cpu().numpy()))
 
-    # Combine audio
+    # Combine and save audio
     full_audio = np.concatenate(audio_chunks)
     output_path = "output.wav"
-    sf.write(output_path, full_audio, 24000)  # XTTS uses 24kHz output
+    sf.write(output_path, full_audio, 24000)
     return output_path
 
 # Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# XTTS v2 Speech Synthesis")
+    gr.Markdown("# XTTS v2 Voice Cloning Demo")
 
     with gr.Tab("1. Extract Voice Embedding"):
-        gr.Markdown("Upload Russian speech sample (10-60 seconds)")
+        gr.Markdown("Upload a Russian audio sample (3-10 seconds)")
         with gr.Row():
             audio_input = gr.Audio(type="filepath", label="Input Audio")
-            embedding_output = gr.File(label="Voice Embedding File")
-        extract_btn = gr.Button("Extract Embedding")
+            embedding_output = gr.File(label="Embedding File")
+        extract_btn = gr.Button("Create Voice Embedding")
        extract_btn.click(
             extract_speaker_embedding,
             inputs=audio_input,
@@ -112,8 +106,8 @@ with gr.Blocks() as demo:
     with gr.Tab("2. Generate Speech"):
         gr.Markdown("Upload embedding and enter Russian text")
         with gr.Row():
-            text_input = gr.Textbox(label="Input Text", lines=4, placeholder="Enter text in Russian...")
-            embedding_input = gr.File(label="Upload Embedding File")
+            text_input = gr.Textbox(label="Text", lines=4, placeholder="Enter text here...")
+            embedding_input = gr.File(label="Embedding File")
         with gr.Row():
             audio_output = gr.Audio(label="Generated Speech", autoplay=True)
         synth_btn = gr.Button("Generate Speech")
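A note on the first hunk: it shows only the start of the torch.load patch, and the lines that apply it fall outside the diff context. For reference, the usual shape of this monkeypatch is sketched below; it is typically needed because PyTorch 2.6 changed torch.load to default to weights_only=True, which rejects the custom config objects inside Coqui checkpoints. The actual remainder of app.py is not visible here, so treat this as a sketch, not the committed code.

import torch

# Keep a reference to the original loader so all other behavior is unchanged
original_torch_load = torch.load

def patched_torch_load(*args, **kwargs):
    # Force full unpickling. Only safe for checkpoints from trusted sources.
    kwargs["weights_only"] = False
    return original_torch_load(*args, **kwargs)

torch.load = patched_torch_load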
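The rewritten extract_speaker_embedding replaces the manual pipeline (pydub resampling, normalization, a direct speaker-encoder call) with XTTS's own get_conditioning_latents, which returns both the GPT conditioning latent and the speaker embedding. A minimal standalone sketch of the new path, with "sample.wav" as a placeholder reference clip:

import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# get_conditioning_latents accepts a list of reference clips
gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(
    audio_path=["sample.wav"]
)

# Save both tensors; synthesize_speech expects exactly these two keys
torch.save(
    {
        "gpt_cond_latent": gpt_cond_latent.cpu(),
        "speaker_embedding": speaker_embedding.cpu(),
    },
    "speaker_embedding.pth",
)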
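split_text itself is almost untouched (only a comment and tuple spacing changed), and the chunking loop sits between the visible hunks. One plausible implementation consistent with the visible pieces (the sentences/current/current_len variables, the max_length=182 default, and the flush at the end); this is a reconstruction, not the committed code:

import re

def split_text(text, max_length=182):
    sentences = []
    current = []
    current_len = 0
    # Split on sentence-ending punctuation, keeping sentences intact
    for piece in re.split(r'(?<=[.!?])\s+', text):
        piece += " "
        if current and current_len + len(piece) > max_length:
            sentences.append("".join(current).strip())
            current, current_len = [], 0
        current.append(piece)
        current_len += len(piece)
    if current:
        sentences.append("".join(current).strip())

    # Ensure every chunk ends with punctuation, as in the committed tail
    processed = []
    for s in sentences:
        if not s.endswith(('.', '!', '?')):
            s += '.'
        processed.append(s)
    return processed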
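One version-sensitive line in the new synthesize_speech: tts.synthesizer.tts_model.inference returns a dict, and depending on the Coqui TTS release, out["wav"] may be a torch tensor or already a NumPy array. The committed wav["wav"].squeeze().cpu().numpy() chain assumes the tensor case. A small defensive helper that handles both, added here for illustration only:

import numpy as np
import torch

def to_numpy_wav(out):
    # Normalize XTTS inference output to a 1-D NumPy array
    wav = out["wav"]
    if isinstance(wav, torch.Tensor):
        wav = wav.squeeze().cpu().numpy()
    return np.asarray(wav).squeeze()

With it, the loop body would read audio_chunks.append(to_numpy_wav(wav)).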
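The diff ends before the second tab's click wiring and the launch call. A hypothetical completion consistent with the handler signatures above (the widget names match the app, but these exact lines are outside the shown context and are not part of the commit):

# synthesize_speech(text, embedding_path): textbox value first, then the
# uploaded embedding file
synth_btn.click(
    synthesize_speech,
    inputs=[text_input, embedding_input],
    outputs=audio_output,
)

demo.launch()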