Chuatury committed
Commit 028cdeb · unverified · 1 Parent(s): 416031b
Files changed (5):
  1. README.md +1 -1
  2. app.py +5 -6
  3. openvoice/api.py +45 -23
  4. openvoice/se_extractor.py +101 -88
  5. requirements.txt +4 -4
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🤗
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 3.48.0
+sdk_version: 5.33.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -35,23 +35,22 @@ tone_color_converter = ToneColorConverter(
 )
 tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")
 
+
 def predict(speaker_wav, transform_wav):
     # initialize a empty info
     text_hint = ""
 
     # extract source_se
-    source_se, _ = se_extractor.get_se(
+    source_se = se_extractor.get_se(
         transform_wav,
         tone_color_converter,
-        vad=True,
     )
 
     # note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
     try:
-        target_se, _ = se_extractor.get_se(
+        target_se = se_extractor.get_se(
             speaker_wav,
             tone_color_converter,
-            vad=True,
         )
     except Exception as e:
         text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
@@ -86,13 +85,13 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Column():
             ref_gr = gr.Audio(
                 label="Reference Audio",
-                info="Click on the ✎ button to upload your own target speaker audio",
+                # info="Click on the ✎ button to upload your own target speaker audio",
                 type="filepath",
                 value="examples/speaker0.mp3",
             )
             tra_gr = gr.Audio(
                 label="Transform Audio",
-                info="Click on the ✎ button to upload your own target transform audio",
+                # info="Click on the ✎ button to upload your own target transform audio",
                 type="filepath",
                 value=None,
             )
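
The call-site change above tracks the new `se_extractor.get_se` signature: it returns the speaker embedding directly rather than an `(se, audio_name)` tuple, and the `vad` flag is gone because VAD splitting is now the only path. A minimal sketch of the updated pattern, assuming the checkpoint layout app.py sets up earlier (the `checkpoints/converter` path is an assumption, not shown in this diff):

# Hypothetical standalone version of the predict() call path.
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

ckpt_converter = "checkpoints/converter"  # assumed path; defined earlier in app.py
tone_color_converter = ToneColorConverter(f"{ckpt_converter}/config.json", device="cuda:0")
tone_color_converter.load_ckpt(f"{ckpt_converter}/checkpoint.pth")

# get_se now returns the embedding tensor itself; no tuple unpacking, no vad kwarg.
target_se = se_extractor.get_se("examples/speaker0.mp3", tone_color_converter)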
openvoice/api.py CHANGED
@@ -9,16 +9,14 @@ from openvoice.models import SynthesizerTrn
 
 
 class OpenVoiceBaseClass(object):
-    def __init__(self,
-                 config_path,
-                 device='cuda:0'):
-        if 'cuda' in device:
+    def __init__(self, config_path, device="cuda:0"):
+        if "cuda" in device:
             assert torch.cuda.is_available()
 
         hps = utils.get_hparams_from_file(config_path)
 
         model = SynthesizerTrn(
-            len(getattr(hps, 'symbols', [])),
+            len(getattr(hps, "symbols", [])),
             hps.data.filter_length // 2 + 1,
             n_speakers=hps.data.n_speakers,
             **hps.model,
@@ -31,32 +29,37 @@ class OpenVoiceBaseClass(object):
 
     def load_ckpt(self, ckpt_path):
         checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
-        a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
+        a, b = self.model.load_state_dict(checkpoint_dict["model"], strict=False)
         print("Loaded checkpoint '{}'".format(ckpt_path))
-        print('missing/unexpected keys:', a, b)
+        print("missing/unexpected keys:", a, b)
 
 
 class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.version = getattr(self.hps, '_version_', "v1")
+        self.version = getattr(self.hps, "_version_", "v1")
 
     def extract_se(self, ref_wav_list, se_save_path=None):
-        if isinstance(ref_wav_list, str):
-            ref_wav_list = [ref_wav_list]
-
+        # if isinstance(ref_wav_list, str):
+        #     ref_wav_list = [ref_wav_list]
+
         device = self.device
         hps = self.hps
         gs = []
-
+
         for fname in ref_wav_list:
            audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
            y = torch.FloatTensor(audio_ref)
            y = y.to(device)
            y = y.unsqueeze(0)
-            y = spectrogram_torch(y, hps.data.filter_length,
-                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                center=False).to(device)
+            y = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(device)
            with torch.no_grad():
                g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
                gs.append(g.detach())
@@ -68,23 +71,42 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
+    def convert(
+        self,
+        audio_src_path,
+        src_se,
+        tgt_se,
+        output_path=None,
+        tau=0.3,
+    ):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
         audio = torch.tensor(audio).float()
-
+
         with torch.no_grad():
             y = torch.FloatTensor(audio).to(self.device)
             y = y.unsqueeze(0)
-            spec = spectrogram_torch(y, hps.data.filter_length,
-                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
-                center=False).to(self.device)
+            spec = spectrogram_torch(
+                y,
+                hps.data.filter_length,
+                hps.data.sampling_rate,
+                hps.data.hop_length,
+                hps.data.win_length,
+                center=False,
+            ).to(self.device)
             spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
-            audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
-                0, 0].data.cpu().float().numpy()
+
+            audio = (
+                self.model.voice_conversion(
+                    spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau
+                )[0][0, 0]
+                .data.cpu()
+                .float()
+                .numpy()
+            )
+
             if output_path is None:
                 return audio
             else:
                 soundfile.write(output_path, audio, hps.data.sampling_rate)
-
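
Net effect of the api.py changes: behaviour is unchanged, but `convert` loses its unused `message="default"` parameter and `extract_se` no longer coerces a bare string into a list. A hedged usage sketch (paths are placeholders; only calls visible in this diff are used):

from openvoice.api import ToneColorConverter

converter = ToneColorConverter("checkpoints/converter/config.json", device="cuda:0")  # assumed paths
converter.load_ckpt("checkpoints/converter/checkpoint.pth")

# extract_se expects an iterable of wav sources readable by librosa.load;
# a bare string would now be iterated character by character.
src_se = converter.extract_se(["source_seg0.wav"])
tgt_se = converter.extract_se(["target_seg0.wav"])

# With output_path=None the converted waveform is returned as a numpy array;
# otherwise it is written to disk at hps.data.sampling_rate.
wav = converter.convert("input.wav", src_se, tgt_se, output_path=None, tau=0.3)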
 
openvoice/se_extractor.py CHANGED
@@ -7,7 +7,9 @@ import base64
 from glob import glob
 import numpy as np
 from pydub import AudioSegment
-from faster_whisper import WhisperModel
+from io import BytesIO
+
+# from faster_whisper import WhisperModel
 import hashlib
 import base64
 import librosa
@@ -16,65 +18,72 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
-    global model
-    if model is None:
-        model = WhisperModel(model_size, device="cuda", compute_type="float16")
-    audio = AudioSegment.from_file(audio_path)
-    max_len = len(audio)
-
-    target_folder = os.path.join(target_dir, audio_name)
-
-    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
-    segments = list(segments)
-
-    # create directory
-    os.makedirs(target_folder, exist_ok=True)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-
-    # segments
-    s_ind = 0
-    start_time = None
-
-    for k, w in enumerate(segments):
-        # process with the time
-        if k == 0:
-            start_time = max(0, w.start)
-
-        end_time = w.end
-
-        # calculate confidence
-        if len(w.words) > 0:
-            confidence = sum([s.probability for s in w.words]) / len(w.words)
-        else:
-            confidence = 0.
-        # clean text
-        text = w.text.replace('...', '')
-
-        # left 0.08s for each audios
-        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
-
-        # segment file name
-        fname = f"{audio_name}_seg{s_ind}.wav"
-
-        # filter out the segment shorter than 1.5s and longer than 20s
-        save = audio_seg.duration_seconds > 1.5 and \
-            audio_seg.duration_seconds < 20. and \
-            len(text) >= 2 and len(text) < 200
-
-        if save:
-            output_file = os.path.join(wavs_folder, fname)
-            audio_seg.export(output_file, format='wav')
-
-        if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start - 0.08)
-
-        s_ind = s_ind + 1
-    return wavs_folder
-
-
-def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+
+
+# def split_audio_whisper(audio_path, audio_name, target_dir="processed"):
+#     global model
+#     if model is None:
+#         model = WhisperModel(model_size, device="cuda", compute_type="float16")
+#     audio = AudioSegment.from_file(audio_path)
+#     max_len = len(audio)
+
+#     target_folder = os.path.join(target_dir, audio_name)
+
+#     segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+#     segments = list(segments)
+
+#     # create directory
+#     os.makedirs(target_folder, exist_ok=True)
+#     wavs_folder = os.path.join(target_folder, "wavs")
+#     os.makedirs(wavs_folder, exist_ok=True)
+
+#     # segments
+#     s_ind = 0
+#     start_time = None
+
+#     for k, w in enumerate(segments):
+#         # process with the time
+#         if k == 0:
+#             start_time = max(0, w.start)
+
+#         end_time = w.end
+
+#         # calculate confidence
+#         if len(w.words) > 0:
+#             confidence = sum([s.probability for s in w.words]) / len(w.words)
+#         else:
+#             confidence = 0.0
+#         # clean text
+#         text = w.text.replace("...", "")
+
+#         # left 0.08s for each audios
+#         audio_seg = audio[
+#             int(start_time * 1000) : min(max_len, int(end_time * 1000) + 80)
+#         ]
+
+#         # segment file name
+#         fname = f"{audio_name}_seg{s_ind}.wav"
+
+#         # filter out the segment shorter than 1.5s and longer than 20s
+#         save = (
+#             audio_seg.duration_seconds > 1.5
+#             and audio_seg.duration_seconds < 20.0
+#             and len(text) >= 2
+#             and len(text) < 200
+#         )
+
+#         if save:
+#             output_file = os.path.join(wavs_folder, fname)
+#             audio_seg.export(output_file, format="wav")
+
+#         if k < len(segments) - 1:
+#             start_time = max(0, segments[k + 1].start - 0.08)
+
+#         s_ind = s_ind + 1
+#     return wavs_folder
+
+
+def split_audio_vad(audio_path, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
     segments = get_vad_segments(
@@ -85,69 +94,73 @@ def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
         method="silero",
     )
     segments = [(seg["start"], seg["end"]) for seg in segments]
-    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
-    print(segments)
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+    # print(segments)
     audio_active = AudioSegment.silent(duration=0)
     audio = AudioSegment.from_file(audio_path)
 
     for start_time, end_time in segments:
-        audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
-
+        audio_active += audio[int(start_time * 1000) : int(end_time * 1000)]
+
     audio_dur = audio_active.duration_seconds
-    print(f'after vad: dur = {audio_dur}')
-    target_folder = os.path.join(target_dir, audio_name)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-    start_time = 0.
+    # print(f"after vad: dur = {audio_dur}")
+    start_time = 0.0
     count = 0
     num_splits = int(np.round(audio_dur / split_seconds))
-    assert num_splits > 0, 'input audio is too short'
+    assert num_splits > 0, "input audio is too short"
     interval = audio_dur / num_splits
 
+    wavs = []
     for i in range(num_splits):
         end_time = min(start_time + interval, audio_dur)
         if i == num_splits - 1:
             end_time = audio_dur
-        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
-        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
-        audio_seg.export(output_file, format='wav')
+        # output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+
+        io = BytesIO()
+        audio_seg = audio_active[int(start_time * 1000) : int(end_time * 1000)]
+        audio_seg.export(io, format="wav")
+        io.seek(0)
+        wavs.append(io)
+
         start_time = end_time
         count += 1
-    return wavs_folder
-
-def hash_numpy_array(audio_path):
-    array, _ = librosa.load(audio_path, sr=None, mono=True)
-    # Convert the array to bytes
-    array_bytes = array.tobytes()
-    # Calculate the hash of the array bytes
-    hash_object = hashlib.sha256(array_bytes)
-    hash_value = hash_object.digest()
-    # Convert the hash value to base64
-    base64_value = base64.b64encode(hash_value)
-    return base64_value.decode('utf-8')[:16].replace('/', '_^')
-
-def get_se(audio_path, vc_model, target_dir='processed', vad=True):
-    device = vc_model.device
+
+    return wavs
+
+
+# def hash_numpy_array(audio_path):
+#     array, _ = librosa.load(audio_path, sr=None, mono=True)
+#     # Convert the array to bytes
+#     array_bytes = array.tobytes()
+#     # Calculate the hash of the array bytes
+#     hash_object = hashlib.sha256(array_bytes)
+#     hash_value = hash_object.digest()
+#     # Convert the hash value to base64
+#     base64_value = base64.b64encode(hash_value)
+#     return base64_value.decode("utf-8")[:16].replace("/", "_^")
+
+
+def get_se(audio_path, vc_model, target_dir="processed"):
+    # device = vc_model.device
     version = vc_model.version
     print("OpenVoice version:", version)
 
-    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
-    se_path = os.path.join(target_dir, audio_name, 'se.pth')
+    # audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+    # se_path = os.path.join(target_dir, audio_name, "se.pth")
 
     # if os.path.isfile(se_path):
     #     se = torch.load(se_path).to(device)
     #     return se, audio_name
     # if os.path.isdir(audio_path):
    #     wavs_folder = audio_path
-
-    if vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
-    else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
-
-    audio_segs = glob(f'{wavs_folder}/*.wav')
-    if len(audio_segs) == 0:
-        raise NotImplementedError('No audio segments found!')
-
-    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
+    # if vad:
+    wavs = split_audio_vad(audio_path)
+
+    # else:
+    #     wavs_folder = split_audio_whisper(
+    #         audio_path, target_dir=target_dir, audio_name=audio_name
+    #     )
+
+    return vc_model.extract_se(wavs)
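
The rewritten `split_audio_vad` keeps the VAD-then-even-split logic but exports each segment to an in-memory `BytesIO` WAV instead of a `wavs/` folder, so `get_se` needs no `target_dir`, no hashing, and no `glob`. The split arithmetic is unchanged: for roughly 25 s of voiced audio and `split_seconds=10.0`, `num_splits = int(np.round(25 / 10)) = 2` and `interval = 12.5` s. A small sketch of the new return value (the example clip is the demo asset used by app.py):

from openvoice.se_extractor import split_audio_vad

wavs = split_audio_vad("examples/speaker0.mp3", split_seconds=10.0)
for buf in wavs:
    # Each element is a BytesIO rewound to position 0; librosa.load accepts
    # such file-like objects, which is what extract_se does with them downstream.
    print(buf.getbuffer().nbytes, "bytes of WAV data")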
requirements.txt CHANGED
@@ -1,8 +1,8 @@
-librosa==0.9.1
-faster-whisper==0.9.0
+librosa==0.11.0
+#faster-whisper==0.9.0
 pydub==0.25.1
-numpy==1.22.0
+numpy
 whisper-timestamped==1.14.2
-openai
+#openai
 torch
 torchaudio
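
With faster-whisper and openai commented out and numpy unpinned, a quick smoke test can confirm the slimmed environment still imports everything the Space touches (the version check reflects the pin above, not an API guarantee):

import librosa, numpy, pydub, torch, torchaudio, whisper_timestamped

print(librosa.__version__)  # expect 0.11.0 per the pin
print(numpy.__version__)    # unpinned; whatever the resolver selects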