Chuatury committed
Commit 028cdeb · unverified · 1 Parent(s): 416031b
Files changed (5):
  1. README.md +1 -1
  2. app.py +5 -6
  3. openvoice/api.py +45 -23
  4. openvoice/se_extractor.py +101 -88
  5. requirements.txt +4 -4
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🤗
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 3.48.0
+sdk_version: 5.33.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -35,23 +35,22 @@ tone_color_converter = ToneColorConverter(
 )
 tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
 
+
 def predict(speaker_wav, transform_wav):
     # initialize a empty info
     text_hint = ""
 
     # extract source_se
-    source_se, _ = se_extractor.get_se(
+    source_se = se_extractor.get_se(
         transform_wav,
         tone_color_converter,
-        vad=True,
     )
 
     # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
     try:
-        target_se, _ = se_extractor.get_se(
+        target_se = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            vad=True,
         )
     except Exception as e:
         text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
@@ -86,13 +85,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             ref_gr = gr.Audio(
                 label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
+                # info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value="examples/speaker0.mp3",
             )
             tra_gr = gr.Audio(
                 label="Transform Audio",
-                info="Click on the ✎ button to upload your own target transform audio",
+                # info="Click on the ✎ button to upload your own target transform audio",
                 type="filepath",
                 value=None,
             )
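
The call-site change above tracks the new `se_extractor.get_se` signature: it returns the speaker embedding directly rather than an `(se, audio_name)` tuple, and the `vad` flag is gone because VAD splitting is now the only path. A minimal sketch of the updated pattern, assuming the checkpoint layout app.py sets up earlier (the `checkpoints/converter` path is an assumption, not shown in this diff):

# Hypothetical standalone version of the predict() call path.
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

ckpt_converter = "checkpoints/converter"  # assumed path; defined earlier in app.py
tone_color_converter = ToneColorConverter(f"{ckpt_converter}/config.json", device="cuda:0")
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")

# get_se now returns the embedding tensor itself; no tuple unpacking, no vad kwarg.
target_se = se_extractor.get_se("examples/speaker0.mp3", tone_color_converter)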
openvoice/api.py CHANGED
@@ -9,16 +9,14 @@ from openvoice.models import SynthesizerTrn
 
 
 class OpenVoiceBaseClass(object):
-    def __init__(self,
-                 config_path,
-                 device='cuda:0'):
-        if 'cuda' in device:
+    def __init__(self, config_path, device="cuda:0"):
+        if "cuda" in device:
             assert torch.cuda.is_available()
 
         hps = utils.get_hparams_from_file(config_path)
 
         model = SynthesizerTrn(
-            len(getattr(hps, 'symbols', [])),
+            len(getattr(hps, "symbols", [])),
             hps.data.filter_length // 2 + 1,
             n_speakers=hps.data.n_speakers,
             **hps.model,
@@ -31,32 +29,37 @@ class OpenVoiceBaseClass(object):
 
     def load_ckpt(self, ckpt_path):
         checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
-        a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
+        a, b = self.model.load_state_dict(checkpoint_dict["model"], strict=False)
         print("Loaded checkpoint '{}'".format(ckpt_path))
-        print('missing/unexpected keys:', a, b)
+        print("missing/unexpected keys:", a, b)
 
 
 class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.version = getattr(self.hps, '_version_', "v1")
+        self.version = getattr(self.hps, "_version_", "v1")
 
     def extract_se(self, ref_wav_list, se_save_path=None):
-        if isinstance(ref_wav_list, str):
-            ref_wav_list = [ref_wav_list]
-
+        # if isinstance(ref_wav_list, str):
+        #     ref_wav_list = [ref_wav_list]
+
         device = self.device
         hps = self.hps
         gs = []
-
+
         for fname in ref_wav_list:
            audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
            y = torch.FloatTensor(audio_ref)
            y = y.to(device)
            y = y.unsqueeze(0)
-            y = spectrogram_torch(y, hps.data.filter_length,
-                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                center=False).to(device)
+            y = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(device)
            with torch.no_grad():
                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                gs.append(g.detach())
@@ -68,23 +71,42 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
+    def convert(
+        self,
+        audio_src_path,
+        src_se,
+        tgt_se,
+        output_path=None,
+        tau=0.3,
+    ):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
         audio = torch.tensor(audio).float()
-
+
         with torch.no_grad():
             y = torch.FloatTensor(audio).to(self.device)
             y = y.unsqueeze(0)
-            spec = spectrogram_torch(y, hps.data.filter_length,
-                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                center=False).to(self.device)
+            spec = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(self.device)
             spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
-            audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
-                0, 0].data.cpu().float().numpy()
+
+            audio = (
+                self.model.voice_conversion(
+                    spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau
+                )[0][0, 0]
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+
             if output_path is None:
                 return audio
             else:
                 soundfile.write(output_path, audio, hps.data.sampling_rate)
-
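
Net effect of the api.py changes: behaviour is unchanged, but `convert` loses its unused `message="default"` parameter and `extract_se` no longer coerces a bare string into a list. A hedged usage sketch (paths are placeholders; only calls visible in this diff are used):

from openvoice.api import ToneColorConverter

converter = ToneColorConverter("checkpoints/converter/config.json", device="cuda:0")  # assumed paths
converter.load_ckpt("checkpoints/converter/checkpoint.pth")

# extract_se expects an iterable of wav sources readable by librosa.load;
# a bare string would now be iterated character by character.
src_se = converter.extract_se(["source_seg0.wav"])
tgt_se = converter.extract_se(["target_seg0.wav"])

# With output_path=None the converted waveform is returned as a numpy array;
# otherwise it is written to disk at hps.data.sampling_rate.
wav = converter.convert("input.wav", src_se, tgt_se, output_path=None, tau=0.3)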
 
openvoice/se_extractor.py CHANGED
@@ -7,7 +7,9 @@ import base64
 from glob import glob
 import numpy as np
 from pydub import AudioSegment
-from faster_whisper import WhisperModel
+from io import BytesIO
+
+# from faster_whisper import WhisperModel
 import hashlib
 import base64
 import librosa
@@ -16,65 +18,72 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
-    global model
-    if model is None:
-        model = WhisperModel(model_size, device="cuda", compute_type="float16")
-    audio = AudioSegment.from_file(audio_path)
-    max_len = len(audio)
-
-    target_folder = os.path.join(target_dir, audio_name)
-
-    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
-    segments = list(segments)
-
-    # create directory
-    os.makedirs(target_folder, exist_ok=True)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-
-    # segments
-    s_ind = 0
-    start_time = None
-
-    for k, w in enumerate(segments):
-        # process with the time
-        if k == 0:
-            start_time = max(0, w.start)
-
-        end_time = w.end
-
-        # calculate confidence
-        if len(w.words) > 0:
-            confidence = sum([s.probability for s in w.words]) / len(w.words)
-        else:
-            confidence = 0.
-        # clean text
-        text = w.text.replace('...', '')
-
-        # left 0.08s for each audios
-        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
-
-        # segment file name
-        fname = f"{audio_name}_seg{s_ind}.wav"
-
-        # filter out the segment shorter than 1.5s and longer than 20s
-        save = audio_seg.duration_seconds > 1.5 and \
-            audio_seg.duration_seconds < 20. and \
-            len(text) >= 2 and len(text) < 200
-
-        if save:
-            output_file = os.path.join(wavs_folder, fname)
-            audio_seg.export(output_file, format='wav')
-
-        if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start - 0.08)
-
-        s_ind = s_ind + 1
-    return wavs_folder
-
-
-def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+
+
+# def split_audio_whisper(audio_path, audio_name, target_dir="processed"):
+#     global model
+#     if model is None:
+#         model = WhisperModel(model_size, device="cuda", compute_type="float16")
+#     audio = AudioSegment.from_file(audio_path)
+#     max_len = len(audio)
+
+#     target_folder = os.path.join(target_dir, audio_name)
+
+#     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+#     segments = list(segments)
+
+#     # create directory
+#     os.makedirs(target_folder, exist_ok=True)
+#     wavs_folder = os.path.join(target_folder, "wavs")
+#     os.makedirs(wavs_folder, exist_ok=True)
+
+#     # segments
+#     s_ind = 0
+#     start_time = None
+
+#     for k, w in enumerate(segments):
+#         # process with the time
+#         if k == 0:
+#             start_time = max(0, w.start)
+
+#         end_time = w.end
+
+#         # calculate confidence
+#         if len(w.words) > 0:
+#             confidence = sum([s.probability for s in w.words]) / len(w.words)
+#         else:
+#             confidence = 0.0
+#         # clean text
+#         text = w.text.replace("...", "")
+
+#         # left 0.08s for each audios
+#         audio_seg = audio[
+#             int(start_time * 1000) : min(max_len, int(end_time * 1000) + 80)
+#         ]
+
+#         # segment file name
+#         fname = f"{audio_name}_seg{s_ind}.wav"
+
+#         # filter out the segment shorter than 1.5s and longer than 20s
+#         save = (
+#             audio_seg.duration_seconds > 1.5
+#             and audio_seg.duration_seconds < 20.0
+#             and len(text) >= 2
+#             and len(text) < 200
+#         )
+
+#         if save:
+#             output_file = os.path.join(wavs_folder, fname)
+#             audio_seg.export(output_file, format="wav")
+
+#         if k < len(segments) - 1:
+#             start_time = max(0, segments[k + 1].start - 0.08)
+
+#         s_ind = s_ind + 1
+#     return wavs_folder
+
+
+def split_audio_vad(audio_path, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
@@ -85,69 +94,73 @@ def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
         method="silero",
     )
     segments = [(seg["start"], seg["end"]) for seg in segments]
-    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
-    print(segments)
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    # print(segments)
     audio_active = AudioSegment.silent(duration=0)
     audio = AudioSegment.from_file(audio_path)
 
     for start_time, end_time in segments:
-        audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
-
+        audio_active += audio[int(start_time * 1000) : int(end_time * 1000)]
+
     audio_dur = audio_active.duration_seconds
-    print(f'after vad: dur = {audio_dur}')
-    target_folder = os.path.join(target_dir, audio_name)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-    start_time = 0.
+    # print(f"after vad: dur = {audio_dur}")
+    start_time = 0.0
     count = 0
     num_splits = int(np.round(audio_dur / split_seconds))
-    assert num_splits > 0, 'input audio is too short'
+    assert num_splits > 0, "input audio is too short"
     interval = audio_dur / num_splits
 
+    wavs = []
     for i in range(num_splits):
         end_time = min(start_time + interval, audio_dur)
         if i == num_splits - 1:
             end_time = audio_dur
-        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
-        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
-        audio_seg.export(output_file, format='wav')
+        # output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+
+        io = BytesIO()
+        audio_seg = audio_active[int(start_time * 1000) : int(end_time * 1000)]
+        audio_seg.export(io, format="wav")
+        io.seek(0)
+        wavs.append(io)
+
         start_time = end_time
         count += 1
-    return wavs_folder
-
-def hash_numpy_array(audio_path):
-    array, _ = librosa.load(audio_path, sr=None, mono=True)
-    # Convert the array to bytes
-    array_bytes = array.tobytes()
-    # Calculate the hash of the array bytes
-    hash_object = hashlib.sha256(array_bytes)
-    hash_value = hash_object.digest()
-    # Convert the hash value to base64
-    base64_value = base64.b64encode(hash_value)
-    return base64_value.decode('utf-8')[:16].replace('/', '_^')
-
-def get_se(audio_path, vc_model, target_dir='processed', vad=True):
-    device = vc_model.device
+
+    return wavs
+
+
+# def hash_numpy_array(audio_path):
+#     array, _ = librosa.load(audio_path, sr=None, mono=True)
+#     # Convert the array to bytes
+#     array_bytes = array.tobytes()
+#     # Calculate the hash of the array bytes
+#     hash_object = hashlib.sha256(array_bytes)
+#     hash_value = hash_object.digest()
+#     # Convert the hash value to base64
+#     base64_value = base64.b64encode(hash_value)
+#     return base64_value.decode("utf-8")[:16].replace("/", "_^")
+
+
+def get_se(audio_path, vc_model, target_dir="processed"):
+    # device = vc_model.device
     version = vc_model.version
     print("OpenVoice version:", version)
 
-    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
-    se_path = os.path.join(target_dir, audio_name, 'se.pth')
+    # audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+    # se_path = os.path.join(target_dir, audio_name, "se.pth")
 
     # if os.path.isfile(se_path):
     #     se = torch.load(se_path).to(device)
     #     return se, audio_name
     # if os.path.isdir(audio_path):
    #     wavs_folder = audio_path
-
-    if vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
-    else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
-
-    audio_segs = glob(f'{wavs_folder}/*.wav')
-    if len(audio_segs) == 0:
-        raise NotImplementedError('No audio segments found!')
-
-    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
+    # if vad:
+    wavs = split_audio_vad(audio_path)
+
+    # else:
+    #     wavs_folder = split_audio_whisper(
+    #         audio_path, target_dir=target_dir, audio_name=audio_name
+    #     )
+
+    return vc_model.extract_se(wavs)
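
The rewritten `split_audio_vad` keeps the VAD-then-even-split logic but exports each segment to an in-memory `BytesIO` WAV instead of a `wavs/` folder, so `get_se` needs no `target_dir`, no hashing, and no `glob`. The split arithmetic is unchanged: for roughly 25 s of voiced audio and `split_seconds=10.0`, `num_splits = int(np.round(25 / 10)) = 2` and `interval = 12.5` s. A small sketch of the new return value (the example clip is the demo asset used by app.py):

from openvoice.se_extractor import split_audio_vad

wavs = split_audio_vad("examples/speaker0.mp3", split_seconds=10.0)
for buf in wavs:
    # Each element is a BytesIO rewound to position 0; librosa.load accepts
    # such file-like objects, which is what extract_se does with them downstream.
    print(buf.getbuffer().nbytes, "bytes of WAV data")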
requirements.txt CHANGED
@@ -1,8 +1,8 @@
-librosa==0.9.1
-faster-whisper==0.9.0
+librosa==0.11.0
+#faster-whisper==0.9.0
 pydub==0.25.1
-numpy==1.22.0
+numpy
 whisper-timestamped==1.14.2
-openai
+#openai
 torch
 torchaudio
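
With faster-whisper and openai commented out and numpy unpinned, a quick smoke test can confirm the slimmed environment still imports everything the Space touches (the version check reflects the pin above, not an API guarantee):

import librosa, numpy, pydub, torch, torchaudio, whisper_timestamped

print(librosa.__version__)  # expect 0.11.0 per the pin
print(numpy.__version__)    # unpinned; whatever the resolver selects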