Spaces:
Runtime error
Runtime error
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import warnings | |
| from argparse import ArgumentParser | |
| from os import listdir, makedirs | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Tuple | |
| from packaging.version import Version, parse | |
| from transformers.pipelines import Pipeline, pipeline | |
| from transformers.tokenization_utils import BatchEncoding | |
| from transformers.utils import ModelOutput, is_tf_available, is_torch_available | |
# This is the minimal required version to
# support some ONNX Runtime features; checked by
# check_onnxruntime_requirements() before quantization is attempted.
ORT_QUANTIZE_MINIMUM_VERSION = parse("1.4.0")

# Pipeline tasks this script knows how to export; used as the set of
# allowed values for the `--pipeline` CLI argument.
SUPPORTED_PIPELINES = [
    "feature-extraction",
    "ner",
    "sentiment-analysis",
    "fill-mask",
    "question-answering",
    "text-generation",
    "translation_en_to_fr",
    "translation_en_to_de",
    "translation_en_to_ro",
]
class OnnxConverterArgumentParser(ArgumentParser):
    """
    Command-line argument parser gathering every option supported when
    exporting a transformers model to the ONNX IR.
    """

    def __init__(self):
        super().__init__("ONNX Converter")

        # Which pipeline task the exported model should serve
        self.add_argument("--pipeline", type=str, default="feature-extraction", choices=SUPPORTED_PIPELINES)
        # Model is the only mandatory option besides the positional output path
        self.add_argument("--model", type=str, required=True, help="Model's id or path (ex: bert-base-cased)")
        self.add_argument("--tokenizer", type=str, help="Tokenizer's id or path (ex: bert-base-cased)")
        self.add_argument("--framework", type=str, choices=["pt", "tf"], help="Framework for loading the model")
        self.add_argument("--opset", type=int, default=11, help="ONNX opset to use")
        # Optional post-export steps
        self.add_argument("--check-loading", action="store_true", help="Check ONNX is able to load the model")
        self.add_argument("--use-external-format", action="store_true", help="Allow exporting model >= than 2Gb")
        self.add_argument("--quantize", action="store_true", help="Quantize the neural network to be run with int8")
        # Destination path of the exported ONNX graph
        self.add_argument("output")
def generate_identified_filename(filename: Path, identifier: str) -> Path:
    """
    Append a string-identifier at the end (before the extension, if any) to the provided filepath.

    Args:
        filename: pathlib.Path The actual path object we would like to add an identifier suffix
        identifier: The suffix to add

    Returns: Path with the identifier concatenated before the original extension
    """
    # Build the new name by plain string concatenation instead of
    # `joinpath(...).with_suffix(...)`: with_suffix() re-parses the suffix of
    # the already-augmented name, so a multi-dot file like "model.v2.onnx"
    # would lose both the identifier and part of the stem
    # ("model.v2.onnx" -> "model.onnx" instead of "model.v2-optimized.onnx").
    return filename.parent / f"{filename.stem}{identifier}{filename.suffix}"
def check_onnxruntime_requirements(minimum_version: Version):
    """
    Check that onnxruntime is installed and that the installed version is recent enough.

    Args:
        minimum_version: The minimal onnxruntime version required to enable all the conversion options

    Raises:
        ImportError: If onnxruntime is not installed or an older version than `minimum_version` is found
    """
    # Only the import itself lives in the try block: previously the
    # "too old version" ImportError raised inside the try was swallowed by
    # this very handler and replaced with the misleading "not installed"
    # message.
    try:
        import onnxruntime
    except ImportError:
        raise ImportError(
            "onnxruntime doesn't seem to be currently installed. "
            "Please install the onnxruntime by running `pip install onnxruntime`"
            " and relaunch the conversion."
        )

    # Parse the version of the installed onnxruntime
    ort_version = parse(onnxruntime.__version__)

    # Compare against the `minimum_version` argument (the original always
    # compared against the module-level ORT_QUANTIZE_MINIMUM_VERSION,
    # ignoring its own parameter).
    if ort_version < minimum_version:
        raise ImportError(
            f"We found an older version of onnxruntime ({onnxruntime.__version__}) "
            f"but we require onnxruntime to be >= {minimum_version} to enable all the conversions options.\n"
            "Please update onnxruntime by running `pip install --upgrade onnxruntime`"
        )
def ensure_valid_input(model, tokens, input_names):
    """
    Re-order the tokenized inputs to match the order of the model's forward() parameters.

    Args:
        model: The model used to forward the input data
        tokens: BatchEncoding (dict-like) holding the input data
        input_names: The names of the inputs to keep

    Returns: Tuple of (ordered input names, tuple of the matching input tensors)
    """
    print("Ensuring inputs are in correct order")

    # Parameter names of forward(), in declaration order
    forward_parameters = model.forward.__code__.co_varnames

    ordered_input_names = []
    model_args = []
    for parameter in forward_parameters[1:]:  # start at index 1 to skip "self" argument
        # Stop at the first forward() parameter we have no input for:
        # everything after it would be passed positionally in the wrong slot.
        if parameter not in input_names:
            print(f"{parameter} is not present in the generated input list.")
            break
        ordered_input_names.append(parameter)
        model_args.append(tokens[parameter])

    print(f"Generated inputs order: {ordered_input_names}")
    return ordered_input_names, tuple(model_args)
def infer_shapes(nlp: Pipeline, framework: str) -> Tuple[List[str], List[str], Dict, BatchEncoding]:
    """
    Attempt to infer the static vs dynamic axes for each input and output tensor for a specific model.

    Args:
        nlp: The pipeline object holding the model to be exported
        framework: The framework identifier to dispatch to the correct inference scheme ("pt"/"tf")

    Returns:
        - List of the inferred input variable names
        - List of the inferred output variable names
        - Dictionary with input/output variable names as key and dynamic-axes mapping as value
        - a BatchEncoding reference which was used to infer all the above information
    """

    def build_shape_dict(name: str, tensor, is_input: bool, seq_len: int):
        # Grouped tensors (e.g. tuples of past key/values) are handled recursively,
        # producing a list of axis dicts mirroring the nesting.
        if isinstance(tensor, (tuple, list)):
            return [build_shape_dict(name, t, is_input, seq_len) for t in tensor]

        else:
            # Let's assume batch is the first axis with only 1 element (~~ might not be always true ...)
            axes = {[axis for axis, numel in enumerate(tensor.shape) if numel == 1][0]: "batch"}
            if is_input:
                # Inputs are expected to be rank-2 (batch, sequence)
                if len(tensor.shape) == 2:
                    axes[1] = "sequence"
                else:
                    raise ValueError(f"Unable to infer tensor axes ({len(tensor.shape)})")
            else:
                # Any output axis whose size equals the tokenized sequence length is
                # assumed to be a dynamic "sequence" axis.
                seq_axes = [dim for dim, shape in enumerate(tensor.shape) if shape == seq_len]
                axes.update({dim: "sequence" for dim in seq_axes})

        print(f"Found {'input' if is_input else 'output'} {name} with shape: {axes}")
        return axes

    # Encode a sample sentence and run it through the model once to observe the shapes
    tokens = nlp.tokenizer("This is a sample output", return_tensors=framework)
    seq_len = tokens.input_ids.shape[-1]
    # PyTorch models take keyword tensors; TF models take the whole encoding
    outputs = nlp.model(**tokens) if framework == "pt" else nlp.model(tokens)
    if isinstance(outputs, ModelOutput):
        outputs = outputs.to_tuple()
    if not isinstance(outputs, (list, tuple)):
        outputs = (outputs,)

    # Generate input names & axes
    input_vars = list(tokens.keys())
    input_dynamic_axes = {k: build_shape_dict(k, v, True, seq_len) for k, v in tokens.items()}

    # flatten potentially grouped outputs (past for gpt2, attentions)
    outputs_flat = []
    for output in outputs:
        if isinstance(output, (tuple, list)):
            outputs_flat.extend(output)
        else:
            outputs_flat.append(output)

    # Generate output names & axes; outputs have no natural names so they are numbered
    output_names = [f"output_{i}" for i in range(len(outputs_flat))]
    output_dynamic_axes = {k: build_shape_dict(k, v, False, seq_len) for k, v in zip(output_names, outputs_flat)}

    # Create the aggregated axes representation (inputs and outputs merged in one dict)
    dynamic_axes = dict(input_dynamic_axes, **output_dynamic_axes)
    return input_vars, output_names, dynamic_axes, tokens
def load_graph_from_args(
    pipeline_name: str, framework: str, model: str, tokenizer: Optional[str] = None, **models_kwargs
) -> Pipeline:
    """
    Build the pipeline (tokenizer + model) described by the CLI arguments.

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework to load the model with ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline, defaults to the model's value

    Returns: Pipeline object

    Raises:
        Exception: If the requested framework is not installed
    """
    # Fall back to the model identifier when no tokenizer was supplied
    tokenizer = model if tokenizer is None else tokenizer

    # Refuse to go further when the backing framework is missing
    if framework == "pt" and not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")
    if framework == "tf" and not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(pipeline_name, model=model, tokenizer=tokenizer, framework=framework, model_kwargs=models_kwargs)
def convert_pytorch(nlp: Pipeline, opset: int, output: Path, use_external_format: bool):
    """
    Export a PyTorch backed pipeline to ONNX Intermediate Representation (IR).

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where will be stored the generated ONNX model
        use_external_format: Split the model definition from its parameters to allow model bigger than 2GB

    Raises:
        Exception: If PyTorch is not installed
    """
    if not is_torch_available():
        raise Exception("Cannot convert because PyTorch is not installed. Please install torch first.")

    import torch
    from torch.onnx import export
    from transformers.pytorch_utils import is_torch_less_than_1_11

    print(f"Using framework PyTorch: {torch.__version__}")

    # Gradients are not needed for tracing/export
    with torch.no_grad():
        # Infer input/output names and dynamic axes from a sample encoding
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "pt")
        # Re-order the inputs to match the model's forward() signature
        ordered_input_names, model_args = ensure_valid_input(nlp.model, tokens, input_names)

        # PyTorch deprecated the `enable_onnx_checker` and `use_external_data_format` arguments in v1.11,
        # so we check the torch version for backwards compatibility
        if is_torch_less_than_1_11:
            export(
                nlp.model,
                model_args,
                f=output.as_posix(),
                input_names=ordered_input_names,
                output_names=output_names,
                dynamic_axes=dynamic_axes,
                do_constant_folding=True,
                use_external_data_format=use_external_format,
                enable_onnx_checker=True,
                opset_version=opset,
            )
        else:
            # On torch >= 1.11 the deprecated arguments are simply omitted
            export(
                nlp.model,
                model_args,
                f=output.as_posix(),
                input_names=ordered_input_names,
                output_names=output_names,
                dynamic_axes=dynamic_axes,
                do_constant_folding=True,
                opset_version=opset,
            )
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR).

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where will be stored the generated ONNX model

    Notes: TensorFlow cannot export model bigger than 2GB due to internal constraint from TensorFlow

    Raises:
        Exception: If TensorFlow is not installed, or if tf2onnx cannot be imported
    """
    if not is_tf_available():
        raise Exception("Cannot convert because TF is not installed. Please install tensorflow first.")

    print("/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\")

    try:
        import tensorflow as tf
        import tf2onnx
        from tf2onnx import __version__ as t2ov

        print(f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}")

        # Build: infer input/output names and which axes are dynamic
        input_names, output_names, dynamic_axes, tokens = infer_shapes(nlp, "tf")

        # Forward pass once with the sample encoding (presumably to build the
        # Keras model before conversion — NOTE(review): confirm this is required)
        nlp.model.predict(tokens.data)
        input_signature = [tf.TensorSpec.from_tensor(tensor, name=key) for key, tensor in tokens.items()]
        model_proto, _ = tf2onnx.convert.from_keras(
            nlp.model, input_signature, opset=opset, output_path=output.as_posix()
        )
    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
        )
def convert(
    framework: str,
    model: str,
    output: Path,
    opset: int,
    tokenizer: Optional[str] = None,
    use_external_format: bool = False,
    pipeline_name: str = "feature-extraction",
    **model_kwargs,
):
    """
    Convert the pipeline object to the ONNX Intermediate Representation (IR) format.

    Args:
        framework: The framework the pipeline is backed by ("pt" or "tf")
        model: The name of the model to load for the pipeline
        output: The path where the ONNX graph will be stored
        opset: The actual version of the ONNX operator set to use
        tokenizer: The name of the tokenizer to load for the pipeline, defaults to the model's name if not provided
        use_external_format:
            Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only)
        pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.)
        model_kwargs: Keyword arguments to be forwarded to the model constructor

    Raises:
        Exception: If the output folder already exists and is not empty
    """
    warnings.warn(
        "The `transformers.convert_graph_to_onnx` package is deprecated and will be removed in version 5 of"
        " Transformers",
        FutureWarning,
    )
    print(f"ONNX opset version set to: {opset}")

    # Instantiate the tokenizer/model pair backing the requested pipeline
    nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer, **model_kwargs)

    # The destination folder must either not exist yet or be empty
    parent_dir = output.parent
    if not parent_dir.exists():
        print(f"Creating folder {parent_dir}")
        makedirs(parent_dir.as_posix())
    elif len(listdir(parent_dir.as_posix())) > 0:
        raise Exception(f"Folder {parent_dir.as_posix()} is not empty, aborting conversion")

    # Dispatch to the exporter matching the backing framework
    if framework == "pt":
        convert_pytorch(nlp, opset, output, use_external_format)
    else:
        convert_tensorflow(nlp, opset, output)
def optimize(onnx_model_path: Path) -> Path:
    """
    Load the model at the specified path and let onnxruntime apply every graph
    optimization it can.

    Args:
        onnx_model_path: filepath where the model binary description is stored

    Returns: Path where the optimized model binary description has been saved
    """
    from onnxruntime import InferenceSession, SessionOptions

    # The optimized graph lands next to the source model, suffixed "-optimized"
    optimized_model_path = generate_identified_filename(onnx_model_path, "-optimized")

    # Creating the session with `optimized_model_filepath` set makes
    # onnxruntime write the optimized graph to that location
    session_options = SessionOptions()
    session_options.optimized_model_filepath = optimized_model_path.as_posix()
    _ = InferenceSession(onnx_model_path.as_posix(), session_options)

    print(f"Optimized model has been written at {optimized_model_path}: \N{heavy check mark}")
    print("/!\\ Optimized model contains hardware specific operators which might not be portable. /!\\")

    return optimized_model_path
def quantize(onnx_model_path: Path) -> Path:
    """
    Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU.

    Args:
        onnx_model_path: Path to location the exported ONNX model is stored

    Returns: The Path generated for the quantized model
    """
    import onnx
    import onnxruntime
    from onnx.onnx_pb import ModelProto
    from onnxruntime.quantization import QuantizationMode
    from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer
    from onnxruntime.quantization.registry import IntegerOpsRegistry

    # Load the ONNX model
    onnx_model = onnx.load(onnx_model_path.as_posix())

    if parse(onnx.__version__) < parse("1.5.0"):
        print(
            "Models larger than 2GB will fail to quantize due to protobuf constraint.\n"
            "Please upgrade to onnxruntime >= 1.5.0."
        )

    # Copy it so the original in-memory model is left untouched
    copy_model = ModelProto()
    copy_model.CopyFrom(onnx_model)

    # Construct quantizer.
    # The two branches previously duplicated the full argument list; only the
    # name of one keyword differs, so build the kwargs once and add the
    # version-dependent one.
    # onnxruntime renamed input_qType to activation_qType in v1.13.1, so we
    # check the onnxruntime version to ensure backward compatibility.
    # See also: https://github.com/microsoft/onnxruntime/pull/12873
    quantizer_kwargs = {
        "model": copy_model,
        "per_channel": False,
        "reduce_range": False,
        "mode": QuantizationMode.IntegerOps,
        "static": False,
        "weight_qType": True,
        "tensors_range": None,
        "nodes_to_quantize": None,
        "nodes_to_exclude": None,
        "op_types_to_quantize": list(IntegerOpsRegistry),
    }
    if parse(onnxruntime.__version__) < parse("1.13.1"):
        quantizer_kwargs["input_qType"] = False
    else:
        quantizer_kwargs["activation_qType"] = False
    quantizer = ONNXQuantizer(**quantizer_kwargs)

    # Quantize and export
    quantizer.quantize_model()

    # Append "-quantized" at the end of the model's name
    quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized")

    # Save model first, then report — the original printed the "has been
    # written" message before actually writing the file.
    onnx.save_model(quantizer.model.model, quantized_model_path.as_posix())
    print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}")

    return quantized_model_path
def verify(path: Path):
    """Attempt to load the ONNX model at `path` with onnxruntime and report whether it succeeded."""
    from onnxruntime import InferenceSession, SessionOptions
    from onnxruntime.capi.onnxruntime_pybind11_state import RuntimeException

    print(f"Checking ONNX model loading from: {path} ...")
    try:
        session_options = SessionOptions()
        # Creating the session is the actual load check; the session itself is discarded
        _ = InferenceSession(path.as_posix(), session_options, providers=["CPUExecutionProvider"])
        print(f"Model {path} correctly loaded: \N{heavy check mark}")
    except RuntimeException as re:
        print(f"Error while loading the model {re}: \N{heavy ballot x}")
if __name__ == "__main__":
    parser = OnnxConverterArgumentParser()
    args = parser.parse_args()

    # Make sure output is an absolute path
    args.output = Path(args.output).absolute()

    try:
        print("\n====== Converting model to ONNX ======")
        # Export the pipeline's graph to the ONNX IR
        convert(
            args.framework,
            args.model,
            args.output,
            args.opset,
            args.tokenizer,
            args.use_external_format,
            args.pipeline,
        )

        if args.quantize:
            # Ensure requirements for quantization on onnxruntime is met
            check_onnxruntime_requirements(ORT_QUANTIZE_MINIMUM_VERSION)

            # onnxruntime optimizations doesn't provide the same level of performances on TensorFlow than PyTorch
            if args.framework == "tf":
                print(
                    "\t Using TensorFlow might not provide the same optimization level compared to PyTorch.\n"
                    "\t For TensorFlow users you can try optimizing the model directly through onnxruntime_tools.\n"
                    "\t For more information, please refer to the onnxruntime documentation:\n"
                    "\t\thttps://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers\n"
                )

            print("\n====== Optimizing ONNX model ======")

            # Quantization works best when using the optimized version of the model
            args.optimized_output = optimize(args.output)

            # Do the quantization on the right graph
            args.quantized_output = quantize(args.optimized_output)

        # Optionally verify every graph that was produced actually loads
        if args.check_loading:
            print("\n====== Check exported ONNX model(s) ======")
            verify(args.output)

            for produced in ("optimized_output", "quantized_output"):
                if hasattr(args, produced):
                    verify(getattr(args, produced))
    except Exception as e:
        print(f"Error while converting the model: {e}")
        exit(1)