remove watermark
- OpenVoice/api.py +1 -51
- app_locally.py +0 -2
- requirements.txt +0 -1
OpenVoice/api.py
CHANGED
@@ -103,14 +103,6 @@ class ToneColorConverter(OpenVoiceBaseClass):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-        if kwargs.get('enable_watermark', True):
-            import wavmark
-            self.watermark_model = wavmark.load_model().to(self.device)
-        else:
-            self.watermark_model = None
-
-
-
     def extract_se(self, ref_wav_list, se_save_path=None):
         if isinstance(ref_wav_list, str):
             ref_wav_list = [ref_wav_list]
@@ -138,7 +130,7 @@ class ToneColorConverter(OpenVoiceBaseClass):
 
         return gs
 
-    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
+    def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3):
         hps = self.hps
         # load audio
         audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
@@ -153,50 +145,8 @@ class ToneColorConverter(OpenVoiceBaseClass):
             spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
             audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
                     0, 0].data.cpu().float().numpy()
-            audio = self.add_watermark(audio, message)
             if output_path is None:
                 return audio
             else:
                 soundfile.write(output_path, audio, hps.data.sampling_rate)
 
-    def add_watermark(self, audio, message):
-        if self.watermark_model is None:
-            return audio
-        device = self.device
-        bits = utils.string_to_bits(message).reshape(-1)
-        n_repeat = len(bits) // 32
-
-        K = 16000
-        coeff = 2
-        for n in range(n_repeat):
-            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
-            if len(trunck) != K:
-                print('Audio too short, fail to add watermark')
-                break
-            message_npy = bits[n * 32: (n + 1) * 32]
-
-            with torch.no_grad():
-                signal = torch.FloatTensor(trunck).to(device)[None]
-                message_tensor = torch.FloatTensor(message_npy).to(device)[None]
-                signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
-                signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
-            audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
-        return audio
-
-    def detect_watermark(self, audio, n_repeat):
-        bits = []
-        K = 16000
-        coeff = 2
-        for n in range(n_repeat):
-            trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
-            if len(trunck) != K:
-                print('Audio too short, fail to detect watermark')
-                return 'Fail'
-            with torch.no_grad():
-                signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
-                message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
-            bits.append(message_decoded_npy)
-        bits = np.stack(bits).reshape(-1, 8)
-        message = utils.bits_to_string(bits)
-        return message
-
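Note: after this change, ToneColorConverter.convert no longer accepts a message argument and never embeds a watermark, so callers that still pass message= will raise a TypeError. A minimal sketch of the updated call, assuming a converter instance and speaker embeddings obtained elsewhere (the file paths below are placeholders, not part of this commit):

    # source_se / target_se are tone-color embeddings, e.g. from tone_color_converter.extract_se(...)
    # 'tmp.wav' and 'output.wav' are placeholder paths
    tone_color_converter.convert(
        audio_src_path='tmp.wav',
        src_se=source_se,
        tgt_se=target_se,
        output_path='output.wav',  # omit to get the converted numpy array returned instead
        tau=0.3,
    )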
app_locally.py
CHANGED
@@ -118,13 +118,11 @@ def predict(prompt, speaker_wav, transform_wav):
 
     save_path = f"{output_dir}/output.wav"
     # Run the tone color converter
-    encode_message = "@MyShell"
     tone_color_converter.convert(
         audio_src_path=src_path,
         src_se=source_se,
         tgt_se=target_se,
         output_path=save_path,
-        message=encode_message,
     )
 
     text_hint += f"""Get response successfully \n"""
requirements.txt
CHANGED
@@ -2,7 +2,6 @@ langid
 librosa==0.9.1
 faster-whisper==0.9.0
 pydub==0.25.1
-wavmark==0.0.2
 numpy==1.22.0
 eng_to_ipa==0.0.2
 inflect==7.0.0
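With wavmark dropped from requirements.txt, a fresh environment should be able to construct the converter without the watermark dependency installed. A quick sanity check, assuming OpenVoice is importable from this repo layout and the usual checkpoint paths (both are assumptions, not part of this commit):

    # Import path and checkpoint locations below are assumed; adjust to the local layout.
    from OpenVoice.api import ToneColorConverter

    converter = ToneColorConverter('checkpoints/converter/config.json', device='cpu')
    converter.load_ckpt('checkpoints/converter/checkpoint.pth')  # should load with no wavmark import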