Spaces:

tester1hf
/

tests

Sleeping

App Files Files Community

tester1hf committed on Feb 18, 2025

Commit

561919f

verified ·

1 Parent(s): 4b0bfb1

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -36

app.py CHANGED Viewed

@@ -7,10 +7,10 @@ import os
 import re
 import soundfile as sf
-# Bypass security and agree to Coqui TOS
 os.environ["COQUI_TOS_AGREED"] = "1"
-# Patch torch.load
 original_torch_load = torch.load
 def patched_torch_load(*args, **kwargs):
     kwargs['weights_only'] = False
@@ -22,28 +22,18 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 def extract_speaker_embedding(audio_path):
-    # Load and process audio
-    audio = AudioSegment.from_file(audio_path)
-    audio = audio.set_channels(1).set_frame_rate(16000)  # XTTS requires 16kHz
-    # Convert to numpy array and normalize
-    audio_array = np.array(audio.get_array_of_samples()).astype(np.float32)
-    audio_array /= np.max(np.abs(audio_array))
-    # Convert to tensor
-    audio_tensor = torch.from_numpy(audio_array).unsqueeze(0).to(device)
-    # Extract embedding
-    with torch.no_grad():
-        embedding = tts.synthesizer.tts_model.speaker_manager.encoder(audio_tensor)
-    # Save embedding
     embedding_path = "speaker_embedding.pth"
-    torch.save(embedding.cpu(), embedding_path)
     return embedding_path
 def split_text(text, max_length=182):
-    # Split text into chunks with proper punctuation
     sentences = []
     current = []
     current_len = 0
@@ -60,49 +50,53 @@ def split_text(text, max_length=182):
     if current:
         sentences.append("".join(current).strip())
-    # Ensure sentences end with punctuation
     processed = []
     for s in sentences:
-        if not s.endswith(('.', '!', '?')):
             s += '.'
         processed.append(s)
     return processed
 def synthesize_speech(text, embedding_path):
-    # Load embedding
-    embedding = torch.load(embedding_path).to(device)
-    # Split text
     text_chunks = split_text(text)
     # Synthesize each chunk
     audio_chunks = []
     for chunk in text_chunks:
-        wav = tts.tts(
             text=chunk,
-            speaker_wav=None,
-            speaker_embedding=embedding,
             language="ru",
         )
-        audio_chunks.append(np.array(wav))
-    # Combine audio
     full_audio = np.concatenate(audio_chunks)
     output_path = "output.wav"
-    sf.write(output_path, full_audio, 24000)  # XTTS uses 24kHz output
     return output_path
 # Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("# XTTS v2 Speech Synthesis")
     with gr.Tab("1. Extract Voice Embedding"):
-        gr.Markdown("Upload Russian speech sample (10-60 seconds)")
         with gr.Row():
             audio_input = gr.Audio(type="filepath", label="Input Audio")
-            embedding_output = gr.File(label="Voice Embedding File")
-        extract_btn = gr.Button("Extract Embedding")
         extract_btn.click(
             extract_speaker_embedding,
             inputs=audio_input,
@@ -112,8 +106,8 @@ with gr.Blocks() as demo:
     with gr.Tab("2. Generate Speech"):
         gr.Markdown("Upload embedding and enter Russian text")
         with gr.Row():
-            text_input = gr.Textbox(label="Input Text", lines=4, placeholder="Enter text in Russian...")
-            embedding_input = gr.File(label="Upload Embedding File")
         with gr.Row():
             audio_output = gr.Audio(label="Generated Speech", autoplay=True)
         synth_btn = gr.Button("Generate Speech")

 import re
 import soundfile as sf
+# Security bypass and TOS agreement
 os.environ["COQUI_TOS_AGREED"] = "1"
+# Patch torch.load for embedding loading
 original_torch_load = torch.load
 def patched_torch_load(*args, **kwargs):
     kwargs['weights_only'] = False
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
 def extract_speaker_embedding(audio_path):
+    # Get conditioning latents using built-in method
+    gpt_cond_latent, speaker_embedding = tts.synthesizer.tts_model.get_conditioning_latents(audio_path=[audio_path])
+    # Save both latents for better voice cloning
     embedding_path = "speaker_embedding.pth"
+    torch.save({
+        "gpt_cond_latent": gpt_cond_latent.cpu(),
+        "speaker_embedding": speaker_embedding.cpu()
+    }, embedding_path)
     return embedding_path
 def split_text(text, max_length=182):
     sentences = []
     current = []
     current_len = 0
     if current:
         sentences.append("".join(current).strip())
     processed = []
     for s in sentences:
+        if not s.endswith(('.','!','?')):
             s += '.'
         processed.append(s)
     return processed
 def synthesize_speech(text, embedding_path):
+    # Load embeddings
+    embeddings = torch.load(embedding_path)
+    gpt_cond_latent = embeddings["gpt_cond_latent"].to(device)
+    speaker_embedding = embeddings["speaker_embedding"].to(device)
+    # Split text into manageable chunks
     text_chunks = split_text(text)
     # Synthesize each chunk
     audio_chunks = []
     for chunk in text_chunks:
+        wav = tts.synthesizer.tts_model.inference(
             text=chunk,
             language="ru",
+            gpt_cond_latent=gpt_cond_latent,
+            speaker_embedding=speaker_embedding,
+            temperature=0.7,
+            length_penalty=1.0,
+            repetition_penalty=2.0,
         )
+        audio_chunks.append(np.array(wav["wav"].squeeze().cpu().numpy()))
+    # Combine and save audio
     full_audio = np.concatenate(audio_chunks)
     output_path = "output.wav"
+    sf.write(output_path, full_audio, 24000)
     return output_path
 # Gradio Interface
 with gr.Blocks() as demo:
+    gr.Markdown("# XTTS v2 Voice Cloning Demo")
     with gr.Tab("1. Extract Voice Embedding"):
+        gr.Markdown("Upload a Russian audio sample (3-10 seconds)")
         with gr.Row():
             audio_input = gr.Audio(type="filepath", label="Input Audio")
+            embedding_output = gr.File(label="Embedding File")
+        extract_btn = gr.Button("Create Voice Embedding")
         extract_btn.click(
             extract_speaker_embedding,
             inputs=audio_input,
     with gr.Tab("2. Generate Speech"):
         gr.Markdown("Upload embedding and enter Russian text")
         with gr.Row():
+            text_input = gr.Textbox(label="Text", lines=4, placeholder="Enter text here...")
+            embedding_input = gr.File(label="Embedding File")
         with gr.Row():
             audio_output = gr.Audio(label="Generated Speech", autoplay=True)
         synth_btn = gr.Button("Generate Speech")