Upload folder using huggingface_hub
- convert_checkpoint.py +231 -0
- encoder.fp16.onnx +3 -0
- export_encoder_tensorrt.py +257 -0
- export_tensorrt.sh +54 -0
- tllm_checkpoint_float16/decoder/config.json +38 -0
- tllm_checkpoint_float16/decoder/rank0.safetensors +3 -0
convert_checkpoint.py
ADDED
@@ -0,0 +1,231 @@
import argparse
import json
import os
import time

import torch
from safetensors.torch import save_file

import tensorrt_llm
from tensorrt_llm.functional import LayerNormPositionType, LayerNormType
from tensorrt_llm.models.convert_utils import weight_only_quantize_dict
from tensorrt_llm.quantization import QuantAlgo


def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, required=True,
                        help="Path to the FireRedASR model.pth.tar checkpoint.")
    parser.add_argument('--output_dir', type=str, default='tllm_checkpoint',
                        help='The path to save the TensorRT-LLM checkpoint.')
    parser.add_argument('--dtype', type=str, default='float16',
                        choices=['float32', 'bfloat16', 'float16'])
    parser.add_argument('--logits_dtype', type=str, default='float16',
                        choices=['float16', 'float32'])
    parser.add_argument(
        '--use_weight_only',
        default=False,
        action="store_true",
        help='Quantize weights for the various GEMMs to INT4/INT8. '
        'See --weight_only_precision to set the precision.')
    parser.add_argument(
        '--weight_only_precision',
        const='int8',
        type=str,
        nargs='?',
        default='int8',
        choices=['int8', 'int4'],
        help='Define the precision for the weights when using weight-only quantization. '
        'You must also pass --use_weight_only for this argument to have an effect.')
    return parser.parse_args()


def get_decoder_config(model_args, dtype: str, logits_dtype: str, quant_algo: QuantAlgo) -> dict:
    return {
        'architecture': "DecoderModel",
        'dtype': dtype,
        'logits_dtype': logits_dtype,
        'num_hidden_layers': model_args.n_layers_dec,
        'num_attention_heads': model_args.n_head,
        'hidden_size': model_args.d_model,
        'norm_epsilon': 1e-5,
        'vocab_size': model_args.odim,
        'hidden_act': "gelu",
        'use_parallel_embedding': False,
        'embedding_sharding_dim': 0,
        'max_position_embeddings': model_args.pe_maxlen,
        'use_prompt_tuning': False,
        'head_size': model_args.d_model // model_args.n_head,
        'has_position_embedding': True,
        'layernorm_type': LayerNormType.LayerNorm,
        'has_attention_qkvo_bias': True,
        'has_mlp_bias': True,
        'has_model_final_layernorm': True,
        'has_embedding_layernorm': False,
        'has_embedding_scale': True,  # FireRedASR scales the embedding
        'ffn_hidden_size': 4 * model_args.d_model,
        'q_scaling': 1.0,
        'layernorm_position': LayerNormPositionType.pre_layernorm,
        'relative_attention': False,
        'max_distance': 0,
        'num_buckets': 0,
        'model_type': 'whisper',  # To align with the Whisper decoder architecture in TRT-LLM
        'rescale_before_lm_head': False,
        'encoder_hidden_size': model_args.d_model,
        'encoder_num_heads': model_args.n_head,
        'encoder_head_size': None,
        'skip_cross_kv': False,
        'quantization': {
            'quant_algo': quant_algo
        },
    }


def remap_state_dict(original_state_dict):
    new_state_dict = {}
    for key, value in original_state_dict.items():
        if key.startswith("decoder."):
            new_key = key
            # Top-level decoder module renames
            new_key = new_key.replace("decoder.tgt_word_emb.", "decoder.token_embedding.")
            new_key = new_key.replace("decoder.layer_stack.", "decoder.blocks.")
            new_key = new_key.replace("decoder.layer_norm_out.", "decoder.ln.")
            new_key = new_key.replace("decoder.tgt_word_prj.", "decoder.output_projection.")

            # ResidualAttentionBlock internal layer renames
            new_key = new_key.replace(".self_attn_norm.", ".attn_ln.")
            new_key = new_key.replace(".self_attn.", ".attn.")
            new_key = new_key.replace(".cross_attn_norm.", ".cross_attn_ln.")
            new_key = new_key.replace(".mlp_norm.", ".mlp_ln.")

            # Inlined PositionwiseFeedForward renames
            new_key = new_key.replace(".mlp.w_1.", ".mlp.0.")
            new_key = new_key.replace(".mlp.w_2.", ".mlp.2.")

            # MultiHeadAttention submodule renames
            new_key = new_key.replace(".w_qs.", ".query.")
            new_key = new_key.replace(".w_ks.", ".key.")
            new_key = new_key.replace(".w_vs.", ".value.")
            new_key = new_key.replace(".fc.", ".out.")

            new_state_dict[new_key] = value

    # Manually handle sinusoidal positional encoding -> learnable embedding
    if "decoder.positional_encoding.pe" in original_state_dict:
        new_state_dict["decoder.positional_embedding"] = original_state_dict["decoder.positional_encoding.pe"].squeeze(0)

    return new_state_dict


def convert_firered_decoder(model_args, model_params, quant_algo: str = None):
    weights = {}

    # The original model shares embedding and projection weights.
    # TRT-LLM's DecoderModel expects a separate lm_head.weight.
    weights['transformer.vocab_embedding.weight'] = model_params['decoder.token_embedding.weight']
    weights['lm_head.weight'] = model_params['decoder.output_projection.weight']
    weights['transformer.position_embedding.weight'] = model_params['decoder.positional_embedding']

    for i in range(model_args.n_layers_dec):
        trtllm_layer_name_prefix = f'transformer.layers.{i}'

        # Self attention
        q_w = model_params[f'decoder.blocks.{i}.attn.query.weight']
        k_w = model_params[f'decoder.blocks.{i}.attn.key.weight']
        v_w = model_params[f'decoder.blocks.{i}.attn.value.weight']
        weights[f'{trtllm_layer_name_prefix}.self_attention.qkv.weight'] = torch.cat([q_w, k_w, v_w], dim=0)

        q_b = model_params[f'decoder.blocks.{i}.attn.query.bias']
        # The key projection has no bias in Whisper's MultiHeadAttention
        k_b = torch.zeros_like(q_b)
        v_b = model_params[f'decoder.blocks.{i}.attn.value.bias']
        weights[f'{trtllm_layer_name_prefix}.self_attention.qkv.bias'] = torch.cat([q_b, k_b, v_b], dim=0)

        weights[f'{trtllm_layer_name_prefix}.self_attention.dense.weight'] = model_params[f'decoder.blocks.{i}.attn.out.weight']
        weights[f'{trtllm_layer_name_prefix}.self_attention.dense.bias'] = model_params[f'decoder.blocks.{i}.attn.out.bias']
        weights[f'{trtllm_layer_name_prefix}.self_attention_layernorm.weight'] = model_params[f'decoder.blocks.{i}.attn_ln.weight']
        weights[f'{trtllm_layer_name_prefix}.self_attention_layernorm.bias'] = model_params[f'decoder.blocks.{i}.attn_ln.bias']

        # Cross attention
        q_w = model_params[f'decoder.blocks.{i}.cross_attn.query.weight']
        k_w = model_params[f'decoder.blocks.{i}.cross_attn.key.weight']
        v_w = model_params[f'decoder.blocks.{i}.cross_attn.value.weight']
        weights[f'{trtllm_layer_name_prefix}.cross_attention.qkv.weight'] = torch.cat([q_w, k_w, v_w], dim=0)

        q_b = model_params[f'decoder.blocks.{i}.cross_attn.query.bias']
        # The key projection has no bias in Whisper's MultiHeadAttention
        k_b = torch.zeros_like(q_b)
        v_b = model_params[f'decoder.blocks.{i}.cross_attn.value.bias']
        weights[f'{trtllm_layer_name_prefix}.cross_attention.qkv.bias'] = torch.cat([q_b, k_b, v_b], dim=0)

        weights[f'{trtllm_layer_name_prefix}.cross_attention.dense.weight'] = model_params[f'decoder.blocks.{i}.cross_attn.out.weight']
        weights[f'{trtllm_layer_name_prefix}.cross_attention.dense.bias'] = model_params[f'decoder.blocks.{i}.cross_attn.out.bias']
        weights[f'{trtllm_layer_name_prefix}.cross_attention_layernorm.weight'] = model_params[f'decoder.blocks.{i}.cross_attn_ln.weight']
        weights[f'{trtllm_layer_name_prefix}.cross_attention_layernorm.bias'] = model_params[f'decoder.blocks.{i}.cross_attn_ln.bias']

        # MLP
        weights[f'{trtllm_layer_name_prefix}.mlp.fc.weight'] = model_params[f'decoder.blocks.{i}.mlp.0.weight']
        weights[f'{trtllm_layer_name_prefix}.mlp.fc.bias'] = model_params[f'decoder.blocks.{i}.mlp.0.bias']
        weights[f'{trtllm_layer_name_prefix}.mlp.proj.weight'] = model_params[f'decoder.blocks.{i}.mlp.2.weight']
        weights[f'{trtllm_layer_name_prefix}.mlp.proj.bias'] = model_params[f'decoder.blocks.{i}.mlp.2.bias']
        weights[f'{trtllm_layer_name_prefix}.mlp_layernorm.weight'] = model_params[f'decoder.blocks.{i}.mlp_ln.weight']
        weights[f'{trtllm_layer_name_prefix}.mlp_layernorm.bias'] = model_params[f'decoder.blocks.{i}.mlp_ln.bias']

    weights['transformer.ln_f.weight'] = model_params['decoder.ln.weight']
    weights['transformer.ln_f.bias'] = model_params['decoder.ln.bias']

    if quant_algo is not None:
        return weight_only_quantize_dict(weights, quant_algo=quant_algo)
    return weights


if __name__ == '__main__':
    print(f"Using TensorRT-LLM version: {tensorrt_llm.__version__}")
    args = parse_arguments()
    tik = time.time()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    quant_algo = None
    if args.use_weight_only and args.weight_only_precision == 'int8':
        quant_algo = QuantAlgo.W8A16
    elif args.use_weight_only and args.weight_only_precision == 'int4':
        quant_algo = QuantAlgo.W4A16

    # Load the original checkpoint
    package = torch.load(args.model_path, map_location='cpu', weights_only=False)
    model_args = package["args"]
    original_state_dict = package["model_state_dict"]
    print(f"Successfully loaded checkpoint from {args.model_path}")
    print("Original model args:", model_args)

    # Remap state dict keys for Whisper compatibility
    remapped_state_dict = remap_state_dict(original_state_dict)

    # Cast tensors to the target dtype
    tensor_dtype = getattr(torch, args.dtype)
    for key, value in remapped_state_dict.items():
        remapped_state_dict[key] = value.to(tensor_dtype)

    # Generate config and convert weights
    print("Converting decoder checkpoint...")
    decoder_config = get_decoder_config(model_args, args.dtype, args.logits_dtype, quant_algo)
    decoder_weights = convert_firered_decoder(model_args, remapped_state_dict, quant_algo)

    # Save the decoder config and weights
    decoder_save_dir = os.path.join(args.output_dir, "decoder")
    if not os.path.exists(decoder_save_dir):
        os.makedirs(decoder_save_dir)

    with open(os.path.join(decoder_save_dir, 'config.json'), 'w') as f:
        json.dump(decoder_config, f, indent=4)

    save_file(decoder_weights, os.path.join(decoder_save_dir, 'rank0.safetensors'))

    tok = time.time()
    t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
    print(f'Checkpoint successfully converted and saved to {args.output_dir}.')
    print(f'Total time of converting checkpoints: {t}')
encoder.fp16.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:979d55f4cecfb651720b037802649f39acb6c235f048c62f7ddb8a1a30bebda8
size 1447173731
export_encoder_tensorrt.py
ADDED
@@ -0,0 +1,257 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Zengwei Yao)
# Copyright 2025 Nvidia Corp. (authors: Yuekai Zhang)
#
# See ../../../../LICENSE for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This script exports a pre-trained FireRedASR encoder model from PyTorch to
ONNX and TensorRT.

Usage:

python3 examples/export_encoder_tensorrt.py \
    --model-dir /path/to/your/model_dir \
    --tensorrt-model-dir ./tensorrt_models \
    --trt-engine-file-name encoder.plan
"""

import argparse
import logging
from pathlib import Path

import torch
import tensorrt as trt

from fireredasr.models.fireredasr import load_fireredasr_aed_model


def get_parser() -> argparse.ArgumentParser:
    """Get the command-line argument parser."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument(
        "--model-dir",
        type=str,
        default=None,
        help="The model directory that contains the model checkpoint.",
    )

    parser.add_argument(
        "--onnx-model-path",
        type=str,
        default=None,
        help="If specified, we will directly use this ONNX model to generate "
        "the TensorRT engine.",
    )

    parser.add_argument(
        "--idim",
        type=int,
        default=80,
        help="The input dimension of the model. This is required when "
        "--onnx-model-path is specified.",
    )

    parser.add_argument(
        "--tensorrt-model-dir",
        type=str,
        default="exp",
        help="Directory to save the exported models.",
    )

    parser.add_argument(
        "--trt-engine-file-name",
        type=str,
        default="encoder.plan",
        help="The name of the TensorRT engine file.",
    )

    parser.add_argument(
        "--opset-version",
        type=int,
        default=17,
        help="ONNX opset version.",
    )

    return parser


def export_encoder_onnx(
    encoder: torch.nn.Module,
    filename: str,
    idim: int,
    opset_version: int = 17,
) -> None:
    """Export the conformer encoder model to ONNX format."""
    logging.info("Exporting encoder to ONNX")
    encoder.half()

    # Create dummy inputs
    seq_len = 400  # A typical sequence length
    batch_size = 1
    padded_input = torch.randn(batch_size, seq_len, idim, dtype=torch.float16)
    input_lengths = torch.tensor([seq_len] * batch_size, dtype=torch.int32)

    # Export
    torch.onnx.export(
        encoder,
        (padded_input, input_lengths),
        filename,
        opset_version=opset_version,
        input_names=["padded_input", "input_lengths"],
        output_names=["enc_output", "output_lengths", "src_mask"],
        dynamic_axes={
            "padded_input": {0: "batch_size", 1: "seq_len"},
            "input_lengths": {0: "batch_size"},
            "enc_output": {0: "batch_size", 1: "seq_len_out"},
            "output_lengths": {0: "batch_size"},
            "src_mask": {0: "batch_size", 2: "seq_len_out"},
        },
    )
    logging.info(f"Exported encoder to {filename}")


def get_trt_kwargs_dynamic_batch(
    idim: int,
    min_batch_size: int = 1,
    opt_batch_size: int = 4,
    max_batch_size: int = 64,
):
    """Get keyword arguments for TensorRT with dynamic batch size."""
    min_seq_len = 50
    opt_seq_len = 400
    max_seq_len = 3000

    min_shape = [(min_batch_size, min_seq_len, idim), (min_batch_size,)]
    opt_shape = [(opt_batch_size, opt_seq_len, idim), (opt_batch_size,)]
    max_shape = [(max_batch_size, max_seq_len, idim), (max_batch_size,)]
    input_names = ["padded_input", "input_lengths"]
    return {
        "min_shape": min_shape,
        "opt_shape": opt_shape,
        "max_shape": max_shape,
        "input_names": input_names,
    }


def convert_onnx_to_trt(
    trt_model: str, trt_kwargs: dict, onnx_model: str, dtype: torch.dtype = torch.float16
) -> None:
    """Convert an ONNX model to a TensorRT engine."""
    logging.info("Converting ONNX to TensorRT engine...")
    network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    logger = trt.Logger(trt.Logger.INFO)
    builder = trt.Builder(logger)
    network = builder.create_network(network_flags)
    parser = trt.OnnxParser(network, logger)
    config = builder.create_builder_config()

    if dtype == torch.float16:
        config.set_flag(trt.BuilderFlag.FP16)

    profile = builder.create_optimization_profile()

    with open(onnx_model, "rb") as f:
        if not parser.parse(f.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            raise ValueError(f'Failed to parse {onnx_model}')

    for i, name in enumerate(trt_kwargs['input_names']):
        profile.set_shape(
            name,
            trt_kwargs['min_shape'][i],
            trt_kwargs['opt_shape'][i],
            trt_kwargs['max_shape'][i]
        )

    config.add_optimization_profile(profile)

    try:
        engine_bytes = builder.build_serialized_network(network, config)
    except Exception as e:
        logging.error(f"TensorRT engine build failed: {e}")
        return

    with open(trt_model, "wb") as f:
        f.write(engine_bytes)
    logging.info("Successfully converted ONNX to TensorRT.")


@torch.no_grad()
def main():
    """Main function to export the model."""
    parser = get_parser()
    args = parser.parse_args()

    tensorrt_model_dir = Path(args.tensorrt_model_dir)
    tensorrt_model_dir.mkdir(parents=True, exist_ok=True)

    if args.onnx_model_path:
        logging.info(f"Using provided ONNX model: {args.onnx_model_path}")
        if not args.idim:
            raise ValueError("--idim is required when using --onnx-model-path")
        idim = args.idim
        encoder_onnx_file = Path(args.onnx_model_path)
        if not encoder_onnx_file.is_file():
            raise FileNotFoundError(f"ONNX model not found at {encoder_onnx_file}")
    else:
        if not args.model_dir:
            raise ValueError(
                "--model-dir is required if --onnx-model-path is not provided"
            )

        logging.info("Exporting ONNX model from PyTorch checkpoint")
        model_dir = Path(args.model_dir)
        model_path = model_dir / "model.pth.tar"

        # Load model to get the encoder
        package = torch.load(model_path, map_location="cpu", weights_only=False)
        model_args = package["args"]
        idim = model_args.idim
        # We have to load the full AED model to get the encoder with weights
        model = load_fireredasr_aed_model(str(model_path))
        encoder = model.encoder
        encoder.eval()

        # Export ONNX
        encoder_onnx_file = tensorrt_model_dir / "encoder.fp16.onnx"
        export_encoder_onnx(
            encoder=encoder,
            filename=str(encoder_onnx_file),
            idim=idim,
            opset_version=args.opset_version,
        )

    # Convert ONNX to TensorRT
    trt_engine_file = tensorrt_model_dir / args.trt_engine_file_name
    trt_kwargs = get_trt_kwargs_dynamic_batch(idim=idim)
    convert_onnx_to_trt(
        trt_model=str(trt_engine_file),
        trt_kwargs=trt_kwargs,
        onnx_model=str(encoder_onnx_file),
        dtype=torch.float16,
    )

    logging.info("Done!")


if __name__ == "__main__":
    formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
    logging.basicConfig(format=formatter, level=logging.INFO)
    main()
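
Before building the engine, it can help to smoke-test the exported ONNX graph. Below is a minimal sketch with onnxruntime; it is not part of this upload, assumes onnxruntime-gpu is installed (the fp16 graph generally needs the CUDA execution provider), and reuses the input/output names and the 80-dim feature default from the script above. The output path ./tensorrt_models matches the usage example in the docstring.

# smoke-test the exported encoder -- illustrative sketch, assumes onnxruntime-gpu
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession(
    "tensorrt_models/encoder.fp16.onnx",  # assumed export location
    providers=["CUDAExecutionProvider"],
)
batch, seq_len, idim = 2, 400, 80
feed = {
    "padded_input": np.random.randn(batch, seq_len, idim).astype(np.float16),
    "input_lengths": np.array([seq_len, seq_len - 40], dtype=np.int32),
}
enc_output, output_lengths, src_mask = sess.run(None, feed)
print(enc_output.shape, output_lengths, src_mask.shape)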
export_tensorrt.sh
ADDED
@@ -0,0 +1,54 @@
export PATH=$PWD/fireredasr/:$PWD/fireredasr/utils/:$PATH
export PYTHONPATH=$PWD/:$PYTHONPATH

# model_path=pretrained_models/FireRedASR-AED-L
# python3 export_encoder_tensorrt.py \
#     --model-dir $model_path \
#     --tensorrt-model-dir $TRT_ENGINE_OUTPUT_DIR \
#     --trt-engine-file-name encoder.plan

TRT_ENGINE_OUTPUT_DIR=./FireRedASR-AED-L-TensorRT
python3 export_encoder_tensorrt.py \
    --onnx-model-path $TRT_ENGINE_OUTPUT_DIR/encoder.fp16.onnx \
    --tensorrt-model-dir $TRT_ENGINE_OUTPUT_DIR \
    --trt-engine-file-name encoder.plan


INFERENCE_PRECISION=float16
MAX_BEAM_WIDTH=4
MAX_BATCH_SIZE=64
checkpoint_dir=$TRT_ENGINE_OUTPUT_DIR/tllm_checkpoint_float16
output_dir=$TRT_ENGINE_OUTPUT_DIR/trt_engine_${INFERENCE_PRECISION}

# model_path=pretrained_models/FireRedASR-AED-L/model.pth.tar
# python3 convert_checkpoint.py \
#     --dtype ${INFERENCE_PRECISION} \
#     --model_path $model_path \
#     --output_dir $checkpoint_dir

trtllm-build --checkpoint_dir ${checkpoint_dir}/decoder \
    --output_dir ${output_dir}/decoder \
    --moe_plugin disable \
    --max_beam_width ${MAX_BEAM_WIDTH} \
    --max_batch_size ${MAX_BATCH_SIZE} \
    --max_seq_len 512 \
    --max_input_len 4 \
    --max_encoder_input_len 1024 \
    --gemm_plugin ${INFERENCE_PRECISION} \
    --remove_input_padding disable \
    --paged_kv_cache disable \
    --gpt_attention_plugin ${INFERENCE_PRECISION}


# FireRedASR-AED-L-TensorRT/
# ├── encoder.fp16.onnx
# ├── encoder.plan
# ├── tllm_checkpoint_float16
# │   └── decoder
# │       ├── config.json
# │       └── rank0.safetensors
# └── trt_engine_float16
#     └── decoder
#         ├── config.json
#         └── rank0.engine
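
To confirm that the built encoder.plan deserializes and kept the dynamic shapes set in the optimization profile, a small inspection sketch follows. It is illustrative only and assumes TensorRT 8.5 or newer, whose tensor-based engine API is used below; the engine path matches $TRT_ENGINE_OUTPUT_DIR from the script above.

# inspect the built engine -- illustrative sketch, assumes TensorRT >= 8.5
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
with open("FireRedASR-AED-L-TensorRT/encoder.plan", "rb") as f:
    engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    # -1 entries correspond to the dynamic batch/sequence axes from the profile
    print(engine.get_tensor_mode(name), name, engine.get_tensor_shape(name))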
tllm_checkpoint_float16/decoder/config.json
ADDED
@@ -0,0 +1,38 @@
{
    "architecture": "DecoderModel",
    "dtype": "float16",
    "logits_dtype": "float16",
    "num_hidden_layers": 16,
    "num_attention_heads": 20,
    "hidden_size": 1280,
    "norm_epsilon": 1e-05,
    "vocab_size": 7832,
    "hidden_act": "gelu",
    "use_parallel_embedding": false,
    "embedding_sharding_dim": 0,
    "max_position_embeddings": 5000,
    "use_prompt_tuning": false,
    "head_size": 64,
    "has_position_embedding": true,
    "layernorm_type": 0,
    "has_attention_qkvo_bias": true,
    "has_mlp_bias": true,
    "has_model_final_layernorm": true,
    "has_embedding_layernorm": false,
    "has_embedding_scale": true,
    "ffn_hidden_size": 5120,
    "q_scaling": 1.0,
    "layernorm_position": 0,
    "relative_attention": false,
    "max_distance": 0,
    "num_buckets": 0,
    "model_type": "whisper",
    "rescale_before_lm_head": false,
    "encoder_hidden_size": 1280,
    "encoder_num_heads": 20,
    "encoder_head_size": null,
    "skip_cross_kv": false,
    "quantization": {
        "quant_algo": null
    }
}
tllm_checkpoint_float16/decoder/rank0.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fae4a3ce0ab15552d307ef960a579c25f479d490b65959cf4189e7a723463037
size 892578184