PhoenixStormJr committed on
Commit
dcbb157
·
verified ·
1 Parent(s): 05bc8cf

Update infer/infer-pm-index256.py

Browse files
Files changed (1) hide show
  1. infer/infer-pm-index256.py +199 -199
infer/infer-pm-index256.py CHANGED
@@ -1,199 +1,199 @@
1
- """
2
-
3
- 对源特征进行检索
4
- """
5
- import torch, pdb, os, parselmouth
6
-
7
- os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
- import numpy as np
9
- import soundfile as sf
10
-
11
- # from models import SynthesizerTrn256#hifigan_nonsf
12
- # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
13
- from infer_pack.models import (
14
- SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
15
- ) # hifigan_nsf
16
-
17
- # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
18
- # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
19
- # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
20
-
21
-
22
- from scipy.io import wavfile
23
- from fairseq import checkpoint_utils
24
-
25
- # import pyworld
26
- import librosa
27
- import torch.nn.functional as F
28
- import scipy.signal as signal
29
-
30
- # import torchcrepe
31
- from time import time as ttime
32
-
33
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
- model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" #
35
- print("load model(s) from {}".format(model_path))
36
- models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
37
- [model_path],
38
- suffix="",
39
- )
40
- model = models[0]
41
- model = model.to(device)
42
- model = model.half()
43
- model.eval()
44
-
45
- # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
46
- # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
47
- net_g = SynthesizerTrn256(
48
- 1025,
49
- 32,
50
- 192,
51
- 192,
52
- 768,
53
- 2,
54
- 6,
55
- 3,
56
- 0,
57
- "1",
58
- [3, 7, 11],
59
- [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
60
- [10, 10, 2, 2],
61
- 512,
62
- [16, 16, 4, 4],
63
- 183,
64
- 256,
65
- is_half=True,
66
- ) # hifigan#512#256#no_dropout
67
- # net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
68
- # net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
69
- #
70
- # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
71
- # net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2
72
-
73
- # weights=torch.load("infer/ft-mi_1k-noD.pt")
74
- # weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
75
- # weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
76
- # weights=torch.load("infer/ft-mi-sim1k.pt")
77
- weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt")
78
- print(net_g.load_state_dict(weights, strict=True))
79
-
80
- net_g.eval().to(device)
81
- net_g.half()
82
-
83
-
84
- def get_f0(x, p_len, f0_up_key=0):
85
- time_step = 160 / 16000 * 1000
86
- f0_min = 50
87
- f0_max = 1100
88
- f0_mel_min = 1127 * np.log(1 + f0_min / 700)
89
- f0_mel_max = 1127 * np.log(1 + f0_max / 700)
90
-
91
- f0 = (
92
- parselmouth.Sound(x, 16000)
93
- .to_pitch_ac(
94
- time_step=time_step / 1000,
95
- voicing_threshold=0.6,
96
- pitch_floor=f0_min,
97
- pitch_ceiling=f0_max,
98
- )
99
- .selected_array["frequency"]
100
- )
101
-
102
- pad_size = (p_len - len(f0) + 1) // 2
103
- if pad_size > 0 or p_len - len(f0) - pad_size > 0:
104
- f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
105
- f0 *= pow(2, f0_up_key / 12)
106
- f0bak = f0.copy()
107
-
108
- f0_mel = 1127 * np.log(1 + f0 / 700)
109
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
110
- f0_mel_max - f0_mel_min
111
- ) + 1
112
- f0_mel[f0_mel <= 1] = 1
113
- f0_mel[f0_mel > 255] = 255
114
- # f0_mel[f0_mel > 188] = 188
115
- f0_coarse = np.rint(f0_mel).astype(np.int)
116
- return f0_coarse, f0bak
117
-
118
-
119
- import faiss
120
-
121
- index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
122
- big_npy = np.load("infer/big_src_feature_mi.npy")
123
- ta0 = ta1 = ta2 = 0
124
- for idx, name in enumerate(
125
- [
126
- "冬之花clip1.wav",
127
- ]
128
- ): ##
129
- wav_path = "todo-songs/%s" % name #
130
- f0_up_key = -2 #
131
- audio, sampling_rate = sf.read(wav_path)
132
- if len(audio.shape) > 1:
133
- audio = librosa.to_mono(audio.transpose(1, 0))
134
- if sampling_rate != 16000:
135
- audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
136
-
137
- feats = torch.from_numpy(audio).float()
138
- if feats.dim() == 2: # double channels
139
- feats = feats.mean(-1)
140
- assert feats.dim() == 1, feats.dim()
141
- feats = feats.view(1, -1)
142
- padding_mask = torch.BoolTensor(feats.shape).fill_(False)
143
- inputs = {
144
- "source": feats.half().to(device),
145
- "padding_mask": padding_mask.to(device),
146
- "output_layer": 9, # layer 9
147
- }
148
- if torch.cuda.is_available():
149
- torch.cuda.synchronize()
150
- t0 = ttime()
151
- with torch.no_grad():
152
- logits = model.extract_features(**inputs)
153
- feats = model.final_proj(logits[0])
154
-
155
- ####索引优化
156
- npy = feats[0].cpu().numpy().astype("float32")
157
- D, I = index.search(npy, 1)
158
- feats = (
159
- torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
160
- )
161
-
162
- feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
163
- if torch.cuda.is_available():
164
- torch.cuda.synchronize()
165
- t1 = ttime()
166
- # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
167
- p_len = min(feats.shape[1], 10000) #
168
- pitch, pitchf = get_f0(audio, p_len, f0_up_key)
169
- p_len = min(feats.shape[1], 10000, pitch.shape[0]) # 太大了爆显存
170
- if torch.cuda.is_available():
171
- torch.cuda.synchronize()
172
- t2 = ttime()
173
- feats = feats[:, :p_len, :]
174
- pitch = pitch[:p_len]
175
- pitchf = pitchf[:p_len]
176
- p_len = torch.LongTensor([p_len]).to(device)
177
- pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
178
- sid = torch.LongTensor([0]).to(device)
179
- pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
180
- with torch.no_grad():
181
- audio = (
182
- net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
183
- .data.cpu()
184
- .float()
185
- .numpy()
186
- ) # nsf
187
- if torch.cuda.is_available():
188
- torch.cuda.synchronize()
189
- t3 = ttime()
190
- ta0 += t1 - t0
191
- ta1 += t2 - t1
192
- ta2 += t3 - t2
193
- # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
194
- # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
195
- # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
196
- wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio) ##
197
-
198
-
199
- print(ta0, ta1, ta2) #
 
1
+ """
2
+
3
+ Searching for source features
4
+ """
5
+ import torch, pdb, os, parselmouth
6
+
7
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
8
+ import numpy as np
9
+ import soundfile as sf
10
+
11
+ # from models import SynthesizerTrn256#hifigan_nonsf
12
+ # from infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf
13
+ from infer_pack.models import (
14
+ SynthesizerTrnMs256NSFsid as SynthesizerTrn256,
15
+ ) # hifigan_nsf
16
+
17
+ # from infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf
18
+ # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf
19
+ # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf
20
+
21
+
22
+ from scipy.io import wavfile
23
+ from fairseq import checkpoint_utils
24
+
25
+ # import pyworld
26
+ import librosa
27
+ import torch.nn.functional as F
28
+ import scipy.signal as signal
29
+
30
+ # import torchcrepe
31
+ from time import time as ttime
32
+
33
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt"  #
print("load model(s) from {}".format(model_path))
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
    [model_path],
    suffix="",
)
# Only the first model of the loaded ensemble is used; it is moved to the
# selected device, cast to half precision and put in eval mode.
model = models[0]
model = model.to(device)
model = model.half()
model.eval()

# Alternative synthesizer configurations kept for reference.
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],183,256,is_half=True)#hifigan#512#256
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],109,256,is_half=True)#hifigan#512#256
net_g = SynthesizerTrn256(
    1025,
    32,
    192,
    192,
    768,
    2,
    6,
    3,
    0,
    "1",
    [3, 7, 11],
    [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    [10, 10, 2, 2],
    512,
    [16, 16, 4, 4],
    183,
    256,
    is_half=True,
)  # hifigan#512#256#no_dropout
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,3,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2,2],512,[16,16,4,4],0)#ts3
# net_g = SynthesizerTrn256(1025,32,192,192,768,2,6,3,0.1,"1", [3,7,11],[[1,3,5], [1,3,5], [1,3,5]],[10,10,2],512,[16,16,4],0)#hifigan-ps-sr
#
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [5,5], 512, [15,15], 0)#ms
# net_g = SynthesizerTrn(1025, 32, 192, 192, 768, 2, 6, 3, 0.1, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,10], 512, [16,16], 0)#idwt2

# weights=torch.load("infer/ft-mi_1k-noD.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder-flow-enc_q_1k.pt")
# weights=torch.load("infer/ft-mi-freeze-vocoder_true_1k.pt")
# weights=torch.load("infer/ft-mi-sim1k.pt")
# Deserialize onto the CPU first so that a checkpoint saved from a GPU can
# still be read when no GPU is available; the module is moved to `device`
# right after the state dict has been loaded.
weights = torch.load("infer/ft-mi-no_opt-no_dropout.pt", map_location="cpu")
# strict=True makes any missing or unexpected key an error; the returned
# key report is printed.
print(net_g.load_state_dict(weights, strict=True))

net_g.eval().to(device)
net_g.half()
82
+
83
+
84
def get_f0(x, p_len, f0_up_key=0):
    """Estimate pitch for 16 kHz audio and quantise it to coarse mel bins.

    The pitch track is computed with Praat's autocorrelation method via
    ``parselmouth`` using a 10 ms step (160 samples at 16000 Hz) and a
    50-1100 Hz search range.

    Args:
        x: waveform samples handed to ``parselmouth.Sound`` with a sampling
            frequency of 16000.
        p_len: number of frames wanted. A shorter pitch track is zero-padded
            on both sides (the extra frame, if any, goes on the left). A
            longer track is returned as is, so the caller must truncate.
        f0_up_key: transposition in semitones applied to the pitch track.

    Returns:
        A ``(f0_coarse, f0bak)`` tuple: the integer bin index per frame in
        the range 1..255 (frames whose pitch is 0 map to bin 1), and a copy
        of the transposed pitch track before quantisation.
    """
    time_step = 160 / 16000 * 1000  # hop length in milliseconds
    f0_min = 50
    f0_max = 1100
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    sound = parselmouth.Sound(x, 16000)
    pitch_track = sound.to_pitch_ac(
        time_step=time_step / 1000,
        voicing_threshold=0.6,
        pitch_floor=f0_min,
        pitch_ceiling=f0_max,
    )
    f0 = pitch_track.selected_array["frequency"]

    # Centre the track inside p_len frames; only pad when it is too short.
    missing = p_len - len(f0)
    if missing > 0:
        left = (missing + 1) // 2
        f0 = np.pad(f0, [[left, missing - left]], mode="constant")
    f0 *= pow(2, f0_up_key / 12)
    f0bak = f0.copy()

    # Map non-zero frames linearly from [f0_mel_min, f0_mel_max] to [1, 255].
    f0_mel = 1127 * np.log(1 + f0 / 700)
    voiced = f0_mel > 0
    f0_mel[voiced] = (f0_mel[voiced] - f0_mel_min) * 254 / (
        f0_mel_max - f0_mel_min
    ) + 1
    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > 255] = 255
    # f0_mel[f0_mel > 188] = 188
    # `np.int` was only an alias of the builtin `int` and no longer exists
    # in NumPy >= 1.24, so the builtin is used directly.
    f0_coarse = np.rint(f0_mel).astype(int)
    return f0_coarse, f0bak
117
+
118
+
119
import faiss


def _synced_time():
    """Return the current time after any queued CUDA work has finished."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return ttime()


# Retrieval index plus the feature matrix that is indexed with its results.
index = faiss.read_index("infer/added_IVF512_Flat_mi_baseline_src_feat.index")
big_npy = np.load("infer/big_src_feature_mi.npy")
# Accumulated seconds: feature extraction + retrieval, pitch, synthesis.
ta0 = ta1 = ta2 = 0
for idx, name in enumerate(
    [
        # NOTE(review): the previous revision read "冬之花clip1.wav"; this
        # literal is part of the path opened below, so confirm that the file
        # in todo-songs/ really carries the translated name.
        "Winter Flowers clip1.wav",
    ]
):  ##
    wav_path = "todo-songs/%s" % name  #
    f0_up_key = -2  #
    audio, sampling_rate = sf.read(wav_path)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)

    feats = torch.from_numpy(audio).float()
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats.half().to(device),
        "padding_mask": padding_mask.to(device),
        "output_layer": 9,  # layer 9
    }
    t0 = _synced_time()
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        feats = model.final_proj(logits[0])

    #### Index optimization
    # Replace every extracted frame by its nearest neighbour in big_npy.
    npy = feats[0].cpu().numpy().astype("float32")
    _, neighbours = index.search(npy, 1)
    # Take column 0 instead of squeeze() so that a single-frame result keeps
    # its frame axis and feats stays 3-D for the permute below.
    feats = (
        torch.from_numpy(big_npy[neighbours[:, 0]].astype("float16"))
        .unsqueeze(0)
        .to(device)
    )

    # Double the number of frames along the time axis.
    feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
    t1 = _synced_time()
    # p_len = min(feats.shape[1],10000,pitch.shape[0])# too large: runs out of GPU memory
    p_len = min(feats.shape[1], 10000)  #
    pitch, pitchf = get_f0(audio, p_len, f0_up_key)
    # Cap the frame count; too large a value runs out of GPU memory.
    p_len = min(feats.shape[1], 10000, pitch.shape[0])
    t2 = _synced_time()
    feats = feats[:, :p_len, :]
    pitch = pitch[:p_len]
    pitchf = pitchf[:p_len]
    p_len = torch.LongTensor([p_len]).to(device)
    pitch = torch.LongTensor(pitch).unsqueeze(0).to(device)
    sid = torch.LongTensor([0]).to(device)
    pitchf = torch.FloatTensor(pitchf).unsqueeze(0).to(device)
    with torch.no_grad():
        audio = (
            net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )  # nsf
    t3 = _synced_time()
    ta0 += t1 - t0
    ta1 += t2 - t1
    ta2 += t3 - t2
    # wavfile.write("ft-mi_1k-index256-noD-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-freeze-vocoder-flow-enc_q_1k-%s.wav"%name, 40000, audio)##
    # wavfile.write("ft-mi-sim1k-%s.wav"%name, 40000, audio)##
    wavfile.write("ft-mi-no_opt-no_dropout-%s.wav" % name, 40000, audio)  ##


print(ta0, ta1, ta2)  #