Spaces:
Paused
Paused
update
Browse files
examples/sound_classification_by_lstm/step_6_export_onnx_model.py
CHANGED
|
@@ -14,14 +14,13 @@ import onnxruntime as ort
|
|
| 14 |
import torch
|
| 15 |
|
| 16 |
from toolbox.torch.utils.data.vocabulary import Vocabulary
|
| 17 |
-
from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import
|
| 18 |
|
| 19 |
|
| 20 |
def get_args():
|
| 21 |
parser = argparse.ArgumentParser()
|
| 22 |
parser.add_argument("--vocabulary_dir", default="file_dir/best/vocabulary", type=str)
|
| 23 |
parser.add_argument("--model_dir", default="file_dir/best", type=str)
|
| 24 |
-
parser.add_argument("--onnx_preprocess_file", default="preprocess.onnx", type=str)
|
| 25 |
parser.add_argument("--onnx_model_file", default="model.onnx", type=str)
|
| 26 |
|
| 27 |
args = parser.parse_args()
|
|
@@ -46,7 +45,6 @@ def logging_config():
|
|
| 46 |
def main():
|
| 47 |
args = get_args()
|
| 48 |
|
| 49 |
-
onnx_preprocess_file = Path(args.onnx_preprocess_file)
|
| 50 |
onnx_model_file = Path(args.onnx_model_file)
|
| 51 |
|
| 52 |
logger = logging_config()
|
|
@@ -57,13 +55,6 @@ def main():
|
|
| 57 |
logger.info("prepare vocabulary, model")
|
| 58 |
vocabulary = Vocabulary.from_files(args.vocabulary_dir)
|
| 59 |
|
| 60 |
-
model_preprocess_export = WaveClassifierPreprocessExport.from_pretrained(
|
| 61 |
-
pretrained_model_name_or_path=args.model_dir,
|
| 62 |
-
num_labels=vocabulary.get_vocab_size(namespace="labels")
|
| 63 |
-
)
|
| 64 |
-
model_preprocess_export.to(device)
|
| 65 |
-
model_preprocess_export.eval()
|
| 66 |
-
|
| 67 |
model_export = WaveClassifierExport.from_pretrained(
|
| 68 |
pretrained_model_name_or_path=args.model_dir,
|
| 69 |
num_labels=vocabulary.get_vocab_size(namespace="labels")
|
|
@@ -78,33 +69,8 @@ def main():
|
|
| 78 |
waveform = torch.unsqueeze(waveform, dim=0)
|
| 79 |
waveform = waveform.to(device)
|
| 80 |
|
| 81 |
-
logger.info("export onnx preprocess models")
|
| 82 |
-
torch.onnx.export(model_preprocess_export,
|
| 83 |
-
args=(waveform,),
|
| 84 |
-
f=onnx_preprocess_file.as_posix(),
|
| 85 |
-
input_names=["inputs"],
|
| 86 |
-
output_names=["spec"],
|
| 87 |
-
dynamic_axes={
|
| 88 |
-
"inputs": {1: "num_samples"},
|
| 89 |
-
}
|
| 90 |
-
)
|
| 91 |
-
|
| 92 |
-
preprocess_ort_session = ort.InferenceSession(onnx_preprocess_file.as_posix())
|
| 93 |
-
input_feed = {
|
| 94 |
-
"inputs": waveform.numpy(),
|
| 95 |
-
}
|
| 96 |
-
output_names = [
|
| 97 |
-
"spec",
|
| 98 |
-
]
|
| 99 |
-
outputs = preprocess_ort_session.run(output_names, input_feed)
|
| 100 |
-
spec = outputs[0]
|
| 101 |
-
# shape = [b, t, f]
|
| 102 |
-
|
| 103 |
logger.info("export onnx models")
|
| 104 |
|
| 105 |
-
inputs = spec
|
| 106 |
-
inputs = torch.tensor(inputs, dtype=torch.float32)
|
| 107 |
-
|
| 108 |
lstm_layer_param = model_export.config.lstm_layer_param
|
| 109 |
num_layers = lstm_layer_param["num_layers"]
|
| 110 |
hidden_size = lstm_layer_param["hidden_size"]
|
|
@@ -112,7 +78,7 @@ def main():
|
|
| 112 |
c = torch.rand(size=(num_layers, 1, hidden_size), dtype=torch.float32)
|
| 113 |
|
| 114 |
torch.onnx.export(model_export,
|
| 115 |
-
args=(
|
| 116 |
f=onnx_model_file.as_posix(),
|
| 117 |
input_names=["inputs", "h", "c"],
|
| 118 |
output_names=[
|
|
@@ -125,7 +91,7 @@ def main():
|
|
| 125 |
|
| 126 |
model_ort_session = ort.InferenceSession(onnx_model_file.as_posix())
|
| 127 |
input_feed = {
|
| 128 |
-
"inputs":
|
| 129 |
"h": h.numpy(),
|
| 130 |
"c": c.numpy(),
|
| 131 |
}
|
|
|
|
| 14 |
import torch
|
| 15 |
|
| 16 |
from toolbox.torch.utils.data.vocabulary import Vocabulary
|
| 17 |
+
from toolbox.torchaudio.models.lstm_audio_classifier.modeling_lstm_audio_classifier import WaveClassifierExport
|
| 18 |
|
| 19 |
|
| 20 |
def get_args():
|
| 21 |
parser = argparse.ArgumentParser()
|
| 22 |
parser.add_argument("--vocabulary_dir", default="file_dir/best/vocabulary", type=str)
|
| 23 |
parser.add_argument("--model_dir", default="file_dir/best", type=str)
|
|
|
|
| 24 |
parser.add_argument("--onnx_model_file", default="model.onnx", type=str)
|
| 25 |
|
| 26 |
args = parser.parse_args()
|
|
|
|
| 45 |
def main():
|
| 46 |
args = get_args()
|
| 47 |
|
|
|
|
| 48 |
onnx_model_file = Path(args.onnx_model_file)
|
| 49 |
|
| 50 |
logger = logging_config()
|
|
|
|
| 55 |
logger.info("prepare vocabulary, model")
|
| 56 |
vocabulary = Vocabulary.from_files(args.vocabulary_dir)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
model_export = WaveClassifierExport.from_pretrained(
|
| 59 |
pretrained_model_name_or_path=args.model_dir,
|
| 60 |
num_labels=vocabulary.get_vocab_size(namespace="labels")
|
|
|
|
| 69 |
waveform = torch.unsqueeze(waveform, dim=0)
|
| 70 |
waveform = waveform.to(device)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
logger.info("export onnx models")
|
| 73 |
|
|
|
|
|
|
|
|
|
|
| 74 |
lstm_layer_param = model_export.config.lstm_layer_param
|
| 75 |
num_layers = lstm_layer_param["num_layers"]
|
| 76 |
hidden_size = lstm_layer_param["hidden_size"]
|
|
|
|
| 78 |
c = torch.rand(size=(num_layers, 1, hidden_size), dtype=torch.float32)
|
| 79 |
|
| 80 |
torch.onnx.export(model_export,
|
| 81 |
+
args=(waveform, h, c),
|
| 82 |
f=onnx_model_file.as_posix(),
|
| 83 |
input_names=["inputs", "h", "c"],
|
| 84 |
output_names=[
|
|
|
|
| 91 |
|
| 92 |
model_ort_session = ort.InferenceSession(onnx_model_file.as_posix())
|
| 93 |
input_feed = {
|
| 94 |
+
"inputs": waveform.numpy(),
|
| 95 |
"h": h.numpy(),
|
| 96 |
"c": c.numpy(),
|
| 97 |
}
|
examples/sound_classification_by_lstm/step_9_evaluation_onnx_model.py
CHANGED
|
@@ -40,7 +40,6 @@ def get_args():
|
|
| 40 |
parser.add_argument("--dataset", default="evaluation.xlsx", type=str)
|
| 41 |
parser.add_argument("--vocabulary_dir", default="vocabulary", type=str)
|
| 42 |
parser.add_argument("--model_dir", default="best", type=str)
|
| 43 |
-
parser.add_argument("--onnx_preprocess_file", default="preprocess.onnx", type=str)
|
| 44 |
parser.add_argument("--onnx_model_file", default="model.onnx", type=str)
|
| 45 |
parser.add_argument("--output_file", default="evaluation_onnx.xlsx", type=str)
|
| 46 |
# parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", type=str)
|
|
@@ -70,23 +69,17 @@ def main():
|
|
| 70 |
f_zip.extractall(path=out_root)
|
| 71 |
tgt_path = out_root / model_file.stem
|
| 72 |
config_file = tgt_path / "config.yaml"
|
| 73 |
-
onnx_preprocess_file = tgt_path / "preprocess.onnx"
|
| 74 |
onnx_model_file = tgt_path / "model.onnx"
|
| 75 |
vocab_path = tgt_path / "vocabulary"
|
| 76 |
evaluation_file = tgt_path / "evaluation.xlsx"
|
| 77 |
else:
|
| 78 |
config_file = model_dir / "config.yaml"
|
| 79 |
-
onnx_preprocess_file = Path(args.onnx_preprocess_file)
|
| 80 |
onnx_model_file = Path(args.onnx_model_file)
|
| 81 |
vocab_path = Path(args.vocabulary_dir)
|
| 82 |
evaluation_file = Path(args.dataset)
|
| 83 |
|
| 84 |
config = WaveClassifierConfig.from_pretrained(config_file.as_posix())
|
| 85 |
-
|
| 86 |
-
onnx_preprocess_file.as_posix(),
|
| 87 |
-
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 88 |
-
)
|
| 89 |
-
model_ort_session = ort.InferenceSession(
|
| 90 |
onnx_model_file.as_posix(),
|
| 91 |
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 92 |
)
|
|
@@ -132,25 +125,16 @@ def main():
|
|
| 132 |
end = begin + 4000
|
| 133 |
|
| 134 |
waveform_ = waveform[:, begin: end]
|
| 135 |
-
input_feed = {
|
| 136 |
-
"inputs": waveform_.numpy(),
|
| 137 |
-
}
|
| 138 |
-
output_names = [
|
| 139 |
-
"spec",
|
| 140 |
-
]
|
| 141 |
-
outputs = preprocess_ort_session.run(output_names, input_feed)
|
| 142 |
-
# shape = [b, t, f]
|
| 143 |
-
inputs: np.ndarray = outputs[0]
|
| 144 |
|
| 145 |
input_feed = {
|
| 146 |
-
"inputs":
|
| 147 |
"h": h,
|
| 148 |
"c": c,
|
| 149 |
}
|
| 150 |
output_names = [
|
| 151 |
"logits", "new_h", "new_c"
|
| 152 |
]
|
| 153 |
-
logits, new_h, new_c =
|
| 154 |
# print(f"logits: {logits.shape}")
|
| 155 |
# print(f"new_h: {new_h.shape}")
|
| 156 |
# print(f"new_c: {new_c.shape}")
|
|
|
|
| 40 |
parser.add_argument("--dataset", default="evaluation.xlsx", type=str)
|
| 41 |
parser.add_argument("--vocabulary_dir", default="vocabulary", type=str)
|
| 42 |
parser.add_argument("--model_dir", default="best", type=str)
|
|
|
|
| 43 |
parser.add_argument("--onnx_model_file", default="model.onnx", type=str)
|
| 44 |
parser.add_argument("--output_file", default="evaluation_onnx.xlsx", type=str)
|
| 45 |
# parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", type=str)
|
|
|
|
| 69 |
f_zip.extractall(path=out_root)
|
| 70 |
tgt_path = out_root / model_file.stem
|
| 71 |
config_file = tgt_path / "config.yaml"
|
|
|
|
| 72 |
onnx_model_file = tgt_path / "model.onnx"
|
| 73 |
vocab_path = tgt_path / "vocabulary"
|
| 74 |
evaluation_file = tgt_path / "evaluation.xlsx"
|
| 75 |
else:
|
| 76 |
config_file = model_dir / "config.yaml"
|
|
|
|
| 77 |
onnx_model_file = Path(args.onnx_model_file)
|
| 78 |
vocab_path = Path(args.vocabulary_dir)
|
| 79 |
evaluation_file = Path(args.dataset)
|
| 80 |
|
| 81 |
config = WaveClassifierConfig.from_pretrained(config_file.as_posix())
|
| 82 |
+
ort_session = ort.InferenceSession(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
onnx_model_file.as_posix(),
|
| 84 |
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
|
| 85 |
)
|
|
|
|
| 125 |
end = begin + 4000
|
| 126 |
|
| 127 |
waveform_ = waveform[:, begin: end]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
input_feed = {
|
| 130 |
+
"inputs": waveform_.numpy(),
|
| 131 |
"h": h,
|
| 132 |
"c": c,
|
| 133 |
}
|
| 134 |
output_names = [
|
| 135 |
"logits", "new_h", "new_c"
|
| 136 |
]
|
| 137 |
+
logits, new_h, new_c = ort_session.run(output_names, input_feed)
|
| 138 |
# print(f"logits: {logits.shape}")
|
| 139 |
# print(f"new_h: {new_h.shape}")
|
| 140 |
# print(f"new_c: {new_c.shape}")
|
toolbox/torchaudio/models/lstm_audio_classifier/modeling_lstm_audio_classifier.py
CHANGED
|
@@ -308,12 +308,14 @@ class WaveClassifierPretrainedModel(WaveClassifier):
|
|
| 308 |
return save_directory
|
| 309 |
|
| 310 |
|
| 311 |
-
class
|
| 312 |
def __init__(self, config: WaveClassifierConfig):
|
| 313 |
-
super(
|
| 314 |
|
| 315 |
def forward(self,
|
| 316 |
inputs: torch.Tensor,
|
|
|
|
|
|
|
| 317 |
):
|
| 318 |
# x: [b, num_samples]
|
| 319 |
x = inputs
|
|
@@ -327,22 +329,8 @@ class WaveClassifierPreprocessExport(WaveClassifierPretrainedModel):
|
|
| 327 |
# shape = [b, t, mel_bins]
|
| 328 |
spec = x + 1e-6
|
| 329 |
spec = spec.log()
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
return spec
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
class WaveClassifierExport(WaveClassifierPretrainedModel):
|
| 336 |
-
def __init__(self, config: WaveClassifierConfig):
|
| 337 |
-
super(WaveClassifierExport, self).__init__(config=config)
|
| 338 |
-
|
| 339 |
-
def forward(self,
|
| 340 |
-
inputs: torch.Tensor,
|
| 341 |
-
h: torch.Tensor = None,
|
| 342 |
-
c: torch.Tensor = None,
|
| 343 |
-
):
|
| 344 |
-
# inputs shape = [b, t, f]
|
| 345 |
-
features, h, c = self.wave_encoder.lstm_layer.forward(inputs, h=h, c=c)
|
| 346 |
# features: shape, [b, t, hidden_size]
|
| 347 |
# h: shape, [num_layers, b, hidden_size]
|
| 348 |
# c: shape, [num_layers, b, hidden_size]
|
|
@@ -358,10 +346,8 @@ class WaveClassifierExport(WaveClassifierPretrainedModel):
|
|
| 358 |
def main():
|
| 359 |
config = WaveClassifierConfig.from_pretrained("examples/lstm_classifier.yaml")
|
| 360 |
model = WaveClassifierPretrainedModel(config)
|
| 361 |
-
model_preprocess = WaveClassifierPreprocessExport(config)
|
| 362 |
model_export = WaveClassifierExport(config)
|
| 363 |
model.eval()
|
| 364 |
-
model_preprocess.eval()
|
| 365 |
model_export.eval()
|
| 366 |
|
| 367 |
inputs = torch.rand(size=(1, 16000), dtype=torch.float32)
|
|
@@ -369,8 +355,7 @@ def main():
|
|
| 369 |
logits = model.forward(inputs)
|
| 370 |
print(logits)
|
| 371 |
|
| 372 |
-
|
| 373 |
-
logits, h, c = model_export.forward(spec)
|
| 374 |
|
| 375 |
return
|
| 376 |
|
|
|
|
| 308 |
return save_directory
|
| 309 |
|
| 310 |
|
| 311 |
+
class WaveClassifierExport(WaveClassifierPretrainedModel):
|
| 312 |
def __init__(self, config: WaveClassifierConfig):
|
| 313 |
+
super(WaveClassifierExport, self).__init__(config=config)
|
| 314 |
|
| 315 |
def forward(self,
|
| 316 |
inputs: torch.Tensor,
|
| 317 |
+
h: torch.Tensor = None,
|
| 318 |
+
c: torch.Tensor = None,
|
| 319 |
):
|
| 320 |
# x: [b, num_samples]
|
| 321 |
x = inputs
|
|
|
|
| 329 |
# shape = [b, t, mel_bins]
|
| 330 |
spec = x + 1e-6
|
| 331 |
spec = spec.log()
|
| 332 |
+
# spec shape = [b, t, f]
|
| 333 |
+
features, h, c = self.wave_encoder.lstm_layer.forward(spec, h=h, c=c)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
# features: shape, [b, t, hidden_size]
|
| 335 |
# h: shape, [num_layers, b, hidden_size]
|
| 336 |
# c: shape, [num_layers, b, hidden_size]
|
|
|
|
| 346 |
def main():
|
| 347 |
config = WaveClassifierConfig.from_pretrained("examples/lstm_classifier.yaml")
|
| 348 |
model = WaveClassifierPretrainedModel(config)
|
|
|
|
| 349 |
model_export = WaveClassifierExport(config)
|
| 350 |
model.eval()
|
|
|
|
| 351 |
model_export.eval()
|
| 352 |
|
| 353 |
inputs = torch.rand(size=(1, 16000), dtype=torch.float32)
|
|
|
|
| 355 |
logits = model.forward(inputs)
|
| 356 |
print(logits)
|
| 357 |
|
| 358 |
+
logits, h, c = model_export.forward(inputs)
|
|
|
|
| 359 |
|
| 360 |
return
|
| 361 |
|
voicemail-es-mx-2-l3-ch64-lstm.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4875557ee77aa194cba21c84bb577383ae5d6aab53a424f1253bb84e2253049e
|
| 3 |
+
size 4528606
|