yuekai committed on
Commit 139ca63 · verified · 1 Parent(s): 1e001c1

Upload folder using huggingface_hub

export_onnx.py ADDED
@@ -0,0 +1,127 @@
+ # Copyright (c) 2024 Antgroup Inc (authors: Zhoubofan, hexisyztem@icloud.com)
+ # Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import print_function
+
+ import argparse
+ import logging
+ logging.getLogger('matplotlib').setLevel(logging.WARNING)
+ import os
+ import sys
+ import onnxruntime
+ import random
+ import torch
+ from tqdm import tqdm
+ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
+ sys.path.append('{}/../..'.format(ROOT_DIR))
+ sys.path.append('{}/../../third_party/Matcha-TTS'.format(ROOT_DIR))
+ from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
+ from cosyvoice.utils.file_utils import logging
+
+
+ def get_dummy_input(batch_size, seq_len, out_channels, device):
+     x = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+     mask = torch.ones((batch_size, 1, seq_len), dtype=torch.float32, device=device)
+     mu = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+     t = torch.rand((batch_size), dtype=torch.float32, device=device)
+     spks = torch.rand((batch_size, out_channels), dtype=torch.float32, device=device)
+     cond = torch.rand((batch_size, out_channels, seq_len), dtype=torch.float32, device=device)
+     return x, mask, mu, t, spks, cond
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(description='export your model for deployment')
+     parser.add_argument('--model_dir',
+                         type=str,
+                         default='pretrained_models/CosyVoice-300M',
+                         help='local path')
+     parser.add_argument('--onnx_model',
+                         type=str,
+                         default='flow.decoder.estimator.fp32.onnx',
+                         help='onnx model name')
+     args = parser.parse_args()
+     print(args)
+     return args
+
+
+ @torch.no_grad()
+ def main():
+     args = get_args()
+     logging.basicConfig(level=logging.DEBUG,
+                         format='%(asctime)s %(levelname)s %(message)s')
+
+     try:
+         model = CosyVoice(args.model_dir)
+     except Exception:
+         try:
+             model = CosyVoice2(args.model_dir)
+         except Exception:
+             raise TypeError('no valid model_type!')
+
+     # 1. export flow decoder estimator
+     estimator = model.model.flow.decoder.estimator
+     estimator.eval()
+
+     device = model.model.device
+     batch_size, seq_len = 2, 256
+     out_channels = model.model.flow.decoder.estimator.out_channels
+     x, mask, mu, t, spks, cond = get_dummy_input(batch_size, seq_len, out_channels, device)
+     torch.onnx.export(
+         estimator,
+         (x, mask, mu, t, spks, cond),
+         f'{args.model_dir}/{args.onnx_model}',
+         export_params=True,
+         opset_version=18,
+         do_constant_folding=True,
+         input_names=['x', 'mask', 'mu', 't', 'spks', 'cond'],
+         output_names=['estimator_out'],
+         dynamic_axes={
+             'x': {0: 'batch_size', 2: 'seq_len'},
+             'mask': {0: 'batch_size', 2: 'seq_len'},
+             'mu': {0: 'batch_size', 2: 'seq_len'},
+             'cond': {0: 'batch_size', 2: 'seq_len'},
+             't': {0: 'batch_size'},
+             'spks': {0: 'batch_size'},
+             'estimator_out': {0: 'batch_size', 2: 'seq_len'},
+         }
+     )
+
+     # 2. test computation consistency
+     option = onnxruntime.SessionOptions()
+     option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+     option.intra_op_num_threads = 1
+     providers = ['CUDAExecutionProvider' if torch.cuda.is_available() else 'CPUExecutionProvider']
+     estimator_onnx = onnxruntime.InferenceSession(f'{args.model_dir}/{args.onnx_model}',
+                                                   sess_options=option, providers=providers)
+
+     for _ in tqdm(range(10)):
+         x, mask, mu, t, spks, cond = get_dummy_input(batch_size, random.randint(16, 512), out_channels, device)
+         output_pytorch = estimator(x, mask, mu, t, spks, cond)
+         ort_inputs = {
+             'x': x.cpu().numpy(),
+             'mask': mask.cpu().numpy(),
+             'mu': mu.cpu().numpy(),
+             't': t.cpu().numpy(),
+             'spks': spks.cpu().numpy(),
+             'cond': cond.cpu().numpy()
+         }
+         output_onnx = estimator_onnx.run(None, ort_inputs)[0]
+         torch.testing.assert_allclose(output_pytorch, torch.from_numpy(output_onnx).to(device), rtol=1e-2, atol=1e-4)
+     logging.info('successfully export estimator')
+
+
+ if __name__ == "__main__":
+     main()
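
Beyond the numerical comparison already built into the script, a quick way to sanity-check the export is to inspect the graph's input/output signatures and confirm that the dynamic axes were recorded. A minimal sketch, assuming the default --model_dir and --onnx_model paths from export_onnx.py:

# Sketch (not part of the commit): list the exported graph's I/O shapes.
# The path below is the script's default output location; adjust as needed.
import onnxruntime

sess = onnxruntime.InferenceSession(
    'pretrained_models/CosyVoice-300M/flow.decoder.estimator.fp32.onnx',
    providers=['CPUExecutionProvider'])

for node in sess.get_inputs() + sess.get_outputs():
    # Dynamic dimensions appear as their symbolic names, e.g. 'batch_size', 'seq_len'.
    print(node.name, node.shape)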
flow.decoder.estimator.fp32.dynamic_batch.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2659223f18194d79ad3b3ce3c2d4e355de108ae6e9402e3583da03d0acc413de
+ size 286312394
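
The .onnx weights are stored through Git LFS, so the file above is only a pointer. If the blob is fetched outside of git-lfs, it can be checked against the pointer's oid (a SHA-256 digest) and size. A small sketch, assuming the file has been downloaded to the working directory:

# Sketch: verify a downloaded copy against the LFS pointer above.
import hashlib
import os

path = 'flow.decoder.estimator.fp32.dynamic_batch.onnx'  # assumed local path
expected_oid = '2659223f18194d79ad3b3ce3c2d4e355de108ae6e9402e3583da03d0acc413de'
expected_size = 286312394  # bytes, from the pointer

h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):  # hash in 1 MiB chunks
        h.update(chunk)

assert os.path.getsize(path) == expected_size, 'size mismatch'
assert h.hexdigest() == expected_oid, 'sha256 mismatch'
print('LFS object matches the pointer')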
run.sh ADDED
@@ -0,0 +1,15 @@
+
+ export CUDA_VISIBLE_DEVICES=0
+ cosyvoice_path=/workspace/CosyVoice
+ export PYTHONPATH=${cosyvoice_path}:$PYTHONPATH
+ export PYTHONPATH=${cosyvoice_path}/third_party/Matcha-TTS:$PYTHONPATH
+
+ model_scope_model_local_dir=./CosyVoice2-0.5B
+
+ # stage/stop_stage are not defined elsewhere in this file; default them here
+ # (assumed values) so the stage guard below works when the script runs standalone.
+ stage=${stage:-7}
+ stop_stage=${stop_stage:-7}
+
+ if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+     # pip install wetext inflect modelscope
+     python3 export_onnx.py --model_dir $model_scope_model_local_dir --onnx_model flow.decoder.estimator.fp32.dynamic_batch.onnx
+ fi
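
Stage 7 of run.sh writes flow.decoder.estimator.fp32.dynamic_batch.onnx into the CosyVoice2-0.5B directory. A rough sketch of exercising that exported graph with a few batch sizes and sequence lengths, to confirm the dynamic axes behave at inference time (out_channels=80 is an assumption here; the real value comes from model.model.flow.decoder.estimator.out_channels as in export_onnx.py):

# Sketch: feed the exported estimator random inputs of varying shape.
import numpy as np
import onnxruntime

sess = onnxruntime.InferenceSession(
    './CosyVoice2-0.5B/flow.decoder.estimator.fp32.dynamic_batch.onnx',
    providers=['CPUExecutionProvider'])

out_channels = 80  # assumed; read it from the PyTorch estimator in practice
for batch_size, seq_len in [(1, 64), (2, 256), (4, 300)]:
    feeds = {
        'x':    np.random.rand(batch_size, out_channels, seq_len).astype(np.float32),
        'mask': np.ones((batch_size, 1, seq_len), dtype=np.float32),
        'mu':   np.random.rand(batch_size, out_channels, seq_len).astype(np.float32),
        't':    np.random.rand(batch_size).astype(np.float32),
        'spks': np.random.rand(batch_size, out_channels).astype(np.float32),
        'cond': np.random.rand(batch_size, out_channels, seq_len).astype(np.float32),
    }
    out = sess.run(None, feeds)[0]
    print(batch_size, seq_len, out.shape)  # expect (batch_size, out_channels, seq_len)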