Spaces:
Running
Running
| # Copyright 2025 The ODML Authors. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| r"""CLI tool for building LiteRT-LM files. | |
| There are two ways to use this tool: | |
| 1. Building the file by specifying the components as CLI arguments: | |
| ``` | |
| bazel run //schema/py:litertlm_builder_cli -- \ | |
| system_metadata --str Authors "ODML team" \ | |
| llm_metadata --path llm.pb \ | |
| tflite_model --path embedder.tflite --model_type embedder --str_metadata model_version "1.0.1" \ | |
| tflite_model --path model.tflite --model_type prefill_decode \ | |
| sp_tokenizer --path sp.model \ | |
| output --path output.litertlm | |
| ``` | |
| Notes: | |
| - Constraints from litertlm_builder.py still apply. | |
| - The order of the components in the CLI arguments determines the order of the | |
| sections in the output LiteRT-LM file. | |
| - Each section can carry multiple metadata entries. | |
| 2. Building the file by specifying the components as a TOML file: | |
| TOML file example: | |
| ``` | |
| [system_metadata] | |
| entries = [ | |
| { key = "author", value_type = "String", value = "The ODML Authors" } | |
| ] | |
| [[section]] | |
| # Section 0: LlmMetadataProto. Can be a text or binary proto file. | |
| section_type = "LlmMetadata" | |
| data_path = "PATH/TO/LLM_METADATA.pb" | |
| [[section]] | |
| # Section 1: SP_Tokenizer (you can also use HF_Tokenizer) | |
| section_type = "SP_Tokenizer" | |
| data_path = "PATH/TO/SP_TOKENIZER.model" | |
| [[section]] | |
| # Section 2: TFLiteModel (Embedder) | |
| section_type = "TFLiteModel" | |
| model_type = "EMBEDDER" | |
| data_path = "PATH/TO/EMBEDDER.tflite" | |
| [[section]] | |
| # Section 3: TFLiteModel (Prefill/Decode) | |
| section_type = "TFLiteModel" | |
| model_type = "PREFILL_DECODE" | |
| data_path = "PATH/TO/PREFILL_DECODE.tflite" | |
| additional_metadata = [ | |
| { key = "License", value_type = "String", value = "Example" }, | |
| { key = "model_version", value_type = "String", value = "1.0.1" } | |
| ] | |
| ``` | |
| ``` | |
| bazel run //schema/py:litertlm_builder_cli -- \ | |
| toml --path example.toml output --path output.litertlm | |
| ``` | |
| """ | |
| import argparse | |
| import os | |
| import sys | |
| from typing import BinaryIO, cast | |
| from absl import app | |
| from litert_lm.schema.py import litertlm_builder | |
| from litert_lm.schema.py import litertlm_core | |
# Every subcommand keyword understood by this CLI. `_parse_args` treats any
# command-line token equal to one of these as the start of a new subcommand
# group.
_SUBCOMMANDS = (
    "toml", "system_metadata", "llm_metadata", "tflite_model",
    "tflite_weights", "sp_tokenizer", "hf_tokenizer", "output",
)
def _add_toml_parser(subparsers) -> None:
  """Registers the `toml` subcommand and its required `--path` flag.

  Args:
    subparsers: The subparsers action (as returned by
      `ArgumentParser.add_subparsers`) that receives the new subcommand.
  """
  toml_cmd = subparsers.add_parser(
      "toml",
      help="Add a TOML file.",
      description="Add a TOML file to the LiteRT-LM file.",
  )
  toml_cmd.add_argument(
      "--path", required=True, type=str, help="The path to the TOML file."
  )
def _add_system_metadata_parser(subparsers) -> None:
  """Registers the `system_metadata` subcommand.

  The subcommand accepts two optional, repeatable flags, `--str` and `--int`,
  each taking a KEY and a VALUE. Repeated uses accumulate into a list of
  `[key, value]` pairs; values are kept as strings at parse time.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  sys_meta_cmd = subparsers.add_parser(
      "system_metadata",
      help="Add system metadata.",
      description=(
          "Add one or more system metadata key-value pairs to the"
          " LiteRT-LM file."
      ),
  )
  # Flag name -> help text; both flags share every other setting.
  pair_flags = {
      "--str": (
          "A string key-value pair for the system metadata."
          " Can be specified multiple times."
      ),
      "--int": (
          "An integer key-value pair for the system metadata."
          " Can be specified multiple times."
      ),
  }
  for flag, help_text in pair_flags.items():
    sys_meta_cmd.add_argument(
        flag,
        action="append",
        nargs=2,
        metavar=("KEY", "VALUE"),
        required=False,
        help=help_text,
    )
def _add_metadata_arguments(parser) -> None:
  """Adds the shared, repeatable `--str_metadata KEY VALUE` flag to `parser`.

  Args:
    parser: The (sub)parser that should accept per-section string metadata.
  """
  help_text = (
      "A string key-value pair for the metadata."
      " Can be specified multiple times."
  )
  parser.add_argument(
      "--str_metadata",
      action="append",
      nargs=2,
      metavar=("KEY", "VALUE"),
      required=False,
      help=help_text,
  )
def _add_llm_metadata_parser(subparsers) -> None:
  """Registers the `llm_metadata` subcommand and its required `--path` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  llm_cmd = subparsers.add_parser(
      "llm_metadata",
      help="Add llm metadata.",
      description=(
          "Add llm metadata to the LiteRT-LM file."
          " Can be a text or binary proto file."
      ),
  )
  llm_cmd.add_argument(
      "--path",
      required=True,
      type=str,
      help="The path to the llm metadata file.",
  )
def _add_tflite_model_parser(subparsers) -> None:
  """Registers the `tflite_model` subcommand.

  Flags: `--path` (required), `--model_type` (required), an optional
  `--backend_constraint`, and the shared `--str_metadata` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  # CLI spellings of the model types: each enum value lower-cased with the
  # "tf_lite_" substring removed.
  model_type_choices = [
      str(member.value).lower().replace("tf_lite_", "")
      for member in litertlm_builder.TfLiteModelType
  ]
  backend_choices = list(litertlm_builder.Backend)

  model_cmd = subparsers.add_parser(
      "tflite_model",
      help="Add a tflite model.",
      description="Add a tflite model to the LiteRT-LM file.",
  )
  model_cmd.add_argument(
      "--path",
      required=True,
      type=str,
      help="The path to the tflite model file.",
  )
  model_cmd.add_argument(
      "--model_type",
      required=True,
      type=str,
      choices=model_type_choices,
      help="The type of the tflite model.",
  )
  # The raw value is lower-cased before it is compared against the choices.
  model_cmd.add_argument(
      "--backend_constraint",
      required=False,
      default=None,
      type=str.lower,
      choices=backend_choices,
      help="A list of backend constraints for the tflite model.",
  )
  _add_metadata_arguments(model_cmd)
def _add_tflite_weights_parser(subparsers) -> None:
  """Registers the `tflite_weights` subcommand.

  Flags: `--path` (required), `--model_type` (required), and the shared
  `--str_metadata` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  # CLI spellings of the model types: each enum value lower-cased with the
  # "tf_lite_" substring removed.
  model_type_choices = [
      str(member.value).lower().replace("tf_lite_", "")
      for member in litertlm_builder.TfLiteModelType
  ]

  weights_cmd = subparsers.add_parser(
      "tflite_weights",
      help="Add tflite weights.",
      description="Add tflite weights to the LiteRT-LM file.",
  )
  weights_cmd.add_argument(
      "--path",
      required=True,
      type=str,
      help="The path to the tflite weights file.",
  )
  weights_cmd.add_argument(
      "--model_type",
      required=True,
      type=str,
      choices=model_type_choices,
      help="The type of the tflite model these weights correspond to.",
  )
  _add_metadata_arguments(weights_cmd)
def _add_sentencepiece_tokenizer_parser(subparsers) -> None:
  """Registers the `sp_tokenizer` subcommand.

  Flags: `--path` (required) and the shared `--str_metadata` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  sp_cmd = subparsers.add_parser(
      "sp_tokenizer",
      help="Add a sentencepiece tokenizer.",
      description="Add a sentencepiece tokenizer to the LiteRT-LM file.",
  )
  sp_cmd.add_argument(
      "--path",
      required=True,
      type=str,
      help="The path to the sentencepiece tokenizer file.",
  )
  _add_metadata_arguments(sp_cmd)
def _add_hf_tokenizer_parser(subparsers) -> None:
  """Registers the `hf_tokenizer` subcommand.

  Flags: `--path` (required) and the shared `--str_metadata` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  hf_cmd = subparsers.add_parser(
      "hf_tokenizer",
      help="Add a huggingface tokenizer.",
      description="Add a huggingface tokenizer to the LiteRT-LM file.",
  )
  hf_cmd.add_argument(
      "--path",
      required=True,
      type=str,
      help="The path to the huggingface tokenizer `tokenizer.json` file.",
  )
  _add_metadata_arguments(hf_cmd)
def _add_output_path_parser(subparsers) -> None:
  """Registers the `output` subcommand and its required `--path` flag.

  Args:
    subparsers: The subparsers action that receives the new subcommand.
  """
  # The same sentence serves as description, subcommand help and flag help.
  blurb = "The path to the output LiteRT-LM file."
  output_cmd = subparsers.add_parser("output", help=blurb, description=blurb)
  output_cmd.add_argument("--path", required=True, type=str, help=blurb)
def _build_parser() -> argparse.ArgumentParser:
  """Creates the top-level parser with every subcommand registered.

  Returns:
    A parser whose chosen subcommand name is stored in the `command`
    attribute of the parsed namespace; a subcommand is mandatory.
  """
  parser = argparse.ArgumentParser(
      description="Build a LiteRT-LM file from input files and metadata."
  )
  subparsers = parser.add_subparsers(dest="command", required=True)
  # Registration order is kept as-is.
  registrars = (
      _add_toml_parser,
      _add_system_metadata_parser,
      _add_llm_metadata_parser,
      _add_tflite_model_parser,
      _add_tflite_weights_parser,
      _add_sentencepiece_tokenizer_parser,
      _add_hf_tokenizer_parser,
      _add_output_path_parser,
  )
  for register in registrars:
    register(subparsers)
  return parser
def _parse_args(parser: argparse.ArgumentParser) -> list[argparse.Namespace]:
  """Splits the command line into subcommand groups and parses each one.

  The arguments are read from `sys.argv[1:]`. Every token that equals a name
  in `_SUBCOMMANDS` starts a new group; all following tokens belong to that
  group until the next subcommand name. Each group is parsed independently
  so that flags shared by several subcommands (for example `--path`) do not
  clash.

  NOTE: because the split is purely token based, a flag *value* that is
  spelled exactly like a subcommand name (e.g. `--path output`) is treated
  as the start of a new subcommand.

  Args:
    parser: The argument parser to use.

  Returns:
    A list of parsed argument namespaces, one per subcommand, in
    command-line order. The list is empty when only help was printed (no
    arguments at all, or a lone `--help` / `-h`).

  Raises:
    ValueError: If an argument appears before any subcommand, or if a
      subcommand group contains arguments the parser does not recognize.
  """
  args = sys.argv[1:]
  # With nothing to do, show the usage instead of exiting silently.
  if not args or (len(args) == 1 and args[0] in ("--help", "-h")):
    print(parser.format_help())
    return []

  # We need to break the arguments into subcommands to ensure overlapping
  # flags are handled correctly. For example, "--path" is a flag for both
  # "llm_metadata" and "output".
  subcommands: list[list[str]] = []
  for arg in args:
    if arg in _SUBCOMMANDS:
      subcommands.append([arg])
    elif subcommands:
      subcommands[-1].append(arg)
    else:
      # Raised explicitly (not asserted) so the check survives `python -O`.
      raise ValueError(
          f"No subcommand found for argument: {arg}. Use --help for a list of"
          " subcommands."
      )

  parsed_args = []
  for subcommand in subcommands:
    parsed, unparsed = parser.parse_known_args(args=subcommand)
    if unparsed:
      raise ValueError(
          f"Failed to parse all arguments. Unparsed args: {unparsed}"
      )
    parsed_args.append(parsed)
  return parsed_args
def _build_system_metadata(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds every `--str` / `--int` pair in `args` to `builder`.

  String pairs are added first, then integer pairs, each in the order given.

  Args:
    args: Parsed `system_metadata` namespace; `args.str` and `args.int` are
      each either falsy or a list of `[key, value]` pairs.
    builder: The file builder that receives the system metadata entries.

  Raises:
    ValueError: If an `--int` value cannot be converted with `int()`.
  """
  for name, text in args.str or ():
    entry = litertlm_builder.Metadata(
        key=name, value=text, dtype=litertlm_builder.DType.STRING
    )
    builder.add_system_metadata(entry)

  for name, raw_number in args.int or ():
    # Values arrive as strings from the command line; convert before storing.
    entry = litertlm_builder.Metadata(
        key=name, value=int(raw_number), dtype=litertlm_builder.DType.INT32
    )
    builder.add_system_metadata(entry)
def _get_metadata_from_args(
    args: argparse.Namespace,
) -> list[litertlm_builder.Metadata] | None:
  """Converts the `--str_metadata` pairs in `args` into metadata objects.

  Args:
    args: A parsed subcommand namespace. It may lack the `str_metadata`
      attribute entirely (not every subcommand registers that flag).

  Returns:
    A list with one STRING-typed metadata entry per pair, in command-line
    order, or None when there are no pairs.
  """
  pairs = getattr(args, "str_metadata", None)
  if not pairs:
    return None
  return [
      litertlm_builder.Metadata(
          key=name, value=text, dtype=litertlm_builder.DType.STRING
      )
      for name, text in pairs
  ]
def _build_llm_metadata(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds the llm metadata file at `args.path` to `builder`.

  Args:
    args: Parsed `llm_metadata` namespace.
    builder: The file builder that receives the section.
  """
  extra_metadata = _get_metadata_from_args(args)
  builder.add_llm_metadata(args.path, additional_metadata=extra_metadata)
def _build_tflite_model(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds the tflite model described by `args` to `builder`.

  Args:
    args: Parsed `tflite_model` namespace (`path`, `model_type`,
      `backend_constraint` and optional `str_metadata`).
    builder: The file builder that receives the section.
  """
  extra_metadata = _get_metadata_from_args(args)
  # Presumably maps the prefix-free CLI spelling back to the enum member;
  # confirm against litertlm_builder.
  model_type = litertlm_builder.TfLiteModelType.get_enum_from_tf_free_value(
      args.model_type
  )
  builder.add_tflite_model(
      args.path,
      model_type,
      backend_constraint=args.backend_constraint,
      additional_metadata=extra_metadata,
  )
def _build_tflite_weights(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds the tflite weights described by `args` to `builder`.

  Args:
    args: Parsed `tflite_weights` namespace (`path`, `model_type` and
      optional `str_metadata`).
    builder: The file builder that receives the section.
  """
  extra_metadata = _get_metadata_from_args(args)
  # Presumably maps the prefix-free CLI spelling back to the enum member;
  # confirm against litertlm_builder.
  model_type = litertlm_builder.TfLiteModelType.get_enum_from_tf_free_value(
      args.model_type
  )
  builder.add_tflite_weights(
      args.path, model_type, additional_metadata=extra_metadata
  )
def _build_sp_tokenizer(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds the sentencepiece tokenizer file at `args.path` to `builder`.

  Args:
    args: Parsed `sp_tokenizer` namespace.
    builder: The file builder that receives the section.
  """
  extra_metadata = _get_metadata_from_args(args)
  builder.add_sentencepiece_tokenizer(
      args.path, additional_metadata=extra_metadata
  )
def _build_hf_tokenizer(
    args: argparse.Namespace,
    builder: litertlm_builder.LitertLmFileBuilder,
) -> None:
  """Adds the huggingface tokenizer file at `args.path` to `builder`.

  Args:
    args: Parsed `hf_tokenizer` namespace.
    builder: The file builder that receives the section.
  """
  extra_metadata = _get_metadata_from_args(args)
  builder.add_hf_tokenizer(args.path, additional_metadata=extra_metadata)
| def _build_litertlm_file(parsed_args: list[argparse.Namespace]) -> None: | |
| """Builds a LiteRT-LM file from the parsed arguments.""" | |
| if "toml" in [pa.command for pa in parsed_args]: | |
| toml_path = None | |
| output_path = None | |
| for parsed_arg in parsed_args: | |
| match parsed_arg.command: | |
| case "output": | |
| output_path = parsed_arg.path | |
| case "toml": | |
| toml_path = parsed_arg.path | |
| case _: | |
| raise ValueError( | |
| "When using TOML, only output and toml are supported." | |
| ) | |
| assert output_path, "Output path is required." | |
| assert toml_path, "TOML path is required." | |
| output_dir = os.path.dirname(output_path) | |
| if output_dir: | |
| os.makedirs(output_dir, exist_ok=True) | |
| with litertlm_core.open_file(output_path, "wb") as f: | |
| builder = litertlm_builder.LitertLmFileBuilder.from_toml_file(toml_path) | |
| builder.build(f) | |
| else: | |
| builder = litertlm_builder.LitertLmFileBuilder() | |
| output_path = None | |
| for parsed_arg in parsed_args: | |
| match parsed_arg.command: | |
| case "system_metadata": | |
| _build_system_metadata(parsed_arg, builder) | |
| case "llm_metadata": | |
| _build_llm_metadata(parsed_arg, builder) | |
| case "tflite_model": | |
| _build_tflite_model(parsed_arg, builder) | |
| case "tflite_weights": | |
| _build_tflite_weights(parsed_arg, builder) | |
| case "sp_tokenizer": | |
| _build_sp_tokenizer(parsed_arg, builder) | |
| case "hf_tokenizer": | |
| _build_hf_tokenizer(parsed_arg, builder) | |
| case "output": | |
| output_path = parsed_arg.path | |
| case _: | |
| raise ValueError(f"Unknown subcommand: {parsed_arg.command}") | |
| assert output_path, "Output path is required." | |
| output_dir = os.path.dirname(output_path) | |
| if output_dir: | |
| os.makedirs(output_dir, exist_ok=True) | |
| with litertlm_core.open_file(output_path, "wb") as f: | |
| builder.build(cast(BinaryIO, f)) | |
| print(f"LiteRT-LM file successfully created at {output_path}") | |
def main(_) -> None:
  """Parses the command line and, if anything was parsed, builds the file.

  Args:
    _: Unused positional argument; the command line is read from `sys.argv`
      by `_parse_args`.
  """
  parsed_args = _parse_args(_build_parser())
  # An empty result means there is nothing to build.
  if parsed_args:
    _build_litertlm_file(parsed_args)
def run():
  """Entry point for console_scripts."""
  # Only the program name is handed to absl; presumably so that absl's own
  # flag parsing never sees the subcommand tokens, which `_parse_args` reads
  # from `sys.argv` itself.
  program_name_only = sys.argv[:1]
  app.run(main, program_name_only)
if __name__ == "__main__":
  # Script execution uses the same invocation as the console_scripts entry
  # point: `run()` calls `app.run(main, sys.argv[:1])`.
  run()