ibitec committed on
Commit
dacea24
·
verified ·
1 Parent(s): a750ef7

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  engines/flan-t5-large/int8_wo_cpu/1-gpu/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
37
  engines/flan-t5-large/int8_wo_cpu/1-gpu/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  engines/flan-t5-large/int8_wo_cpu/1-gpu/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
37
  engines/flan-t5-large/int8_wo_cpu/1-gpu/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
38
+ int8_wo_cpu/1-gpu/decoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
39
+ int8_wo_cpu/1-gpu/encoder/rank0.engine filter=lfs diff=lfs merge=lfs -text
int8_wo_cpu/1-gpu/decoder/config.json ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.3.0rc8",
3
+ "pretrained_config": {
4
+ "architecture": "DecoderModel",
5
+ "dtype": "float16",
6
+ "vocab_size": 32128,
7
+ "hidden_size": 1024,
8
+ "num_hidden_layers": 24,
9
+ "num_attention_heads": 16,
10
+ "hidden_act": "gelu",
11
+ "logits_dtype": "float16",
12
+ "norm_epsilon": 1e-06,
13
+ "runtime_defaults": null,
14
+ "position_embedding_type": "relative",
15
+ "num_key_value_heads": 16,
16
+ "intermediate_size": 2816,
17
+ "max_position_embeddings": 512,
18
+ "mapping": {
19
+ "world_size": 1,
20
+ "gpus_per_node": 8,
21
+ "cp_size": 1,
22
+ "tp_size": 1,
23
+ "pp_size": 1,
24
+ "moe_tp_size": 1,
25
+ "moe_cluster_size": 1,
26
+ "moe_ep_size": 1,
27
+ "attn_tp_size": 1,
28
+ "attn_cp_size": 1,
29
+ "cp_config": {},
30
+ "enable_attention_dp": false,
31
+ "enable_lm_head_tp_in_adp": false
32
+ },
33
+ "quantization": {
34
+ "quant_algo": "W8A16",
35
+ "kv_cache_quant_algo": null,
36
+ "group_size": 128,
37
+ "smoothquant_val": 0.5,
38
+ "clamp_val": null,
39
+ "use_meta_recipe": false,
40
+ "has_zero_point": false,
41
+ "pre_quant_scale": false,
42
+ "exclude_modules": null,
43
+ "mamba_ssm_cache_dtype": null,
44
+ "mamba_ssm_stochastic_rounding": false,
45
+ "mamba_ssm_philox_rounds": 10
46
+ },
47
+ "use_parallel_embedding": true,
48
+ "embedding_sharding_dim": 0,
49
+ "head_size": 64,
50
+ "qk_layernorm": false,
51
+ "rotary_embedding_dim": 64,
52
+ "producer": {
53
+ "name": "modelopt",
54
+ "version": "0.37.0"
55
+ },
56
+ "share_embedding_table": false,
57
+ "residual_mlp": false,
58
+ "bias": false,
59
+ "rotary_pct": 1.0,
60
+ "rank": 0,
61
+ "decoder": "t5",
62
+ "rmsnorm": true,
63
+ "lm_head_bias": false,
64
+ "has_position_embedding": false,
65
+ "layernorm_type": 1,
66
+ "has_attention_qkvo_bias": false,
67
+ "has_mlp_bias": false,
68
+ "has_model_final_layernorm": true,
69
+ "mlp_type": 1,
70
+ "use_prompt_tuning": false,
71
+ "has_embedding_layernorm": false,
72
+ "has_embedding_scale": false,
73
+ "ffn_hidden_size": 2816,
74
+ "q_scaling": 0.125,
75
+ "layernorm_position": 0,
76
+ "relative_attention": true,
77
+ "max_distance": 128,
78
+ "num_buckets": 32,
79
+ "model_type": "t5",
80
+ "use_implicit_relative_attention": false,
81
+ "rescale_before_lm_head": false,
82
+ "encoder_hidden_size": 1024,
83
+ "encoder_num_heads": 16,
84
+ "encoder_head_size": 64,
85
+ "decoder_start_token_id": 0,
86
+ "eos_token_id": 1,
87
+ "bos_token_id": null,
88
+ "pad_token_id": 0,
89
+ "type_vocab_size": null,
90
+ "encoder_num_kv_heads": null,
91
+ "skip_cross_kv": false,
92
+ "residual_scaling": 1.0,
93
+ "has_lm_head_bias": false
94
+ },
95
+ "build_config": {
96
+ "max_input_len": 1,
97
+ "max_seq_len": 129,
98
+ "opt_batch_size": 8,
99
+ "max_batch_size": 4,
100
+ "max_beam_width": 1,
101
+ "max_num_tokens": 516,
102
+ "opt_num_tokens": 4,
103
+ "max_prompt_embedding_table_size": 0,
104
+ "kv_cache_type": "paged",
105
+ "gather_context_logits": false,
106
+ "gather_generation_logits": false,
107
+ "strongly_typed": true,
108
+ "force_num_profiles": null,
109
+ "profiling_verbosity": "layer_names_only",
110
+ "enable_debug_output": false,
111
+ "max_draft_len": 0,
112
+ "speculative_decoding_mode": 1,
113
+ "use_refit": false,
114
+ "input_timing_cache": null,
115
+ "output_timing_cache": "model.cache",
116
+ "lora_config": {
117
+ "lora_dir": [],
118
+ "lora_ckpt_source": "hf",
119
+ "max_lora_rank": 64,
120
+ "lora_target_modules": [],
121
+ "trtllm_modules_to_hf_modules": {},
122
+ "max_loras": null,
123
+ "max_cpu_loras": null,
124
+ "swap_gate_up_proj_lora_b_weight": true
125
+ },
126
+ "weight_sparsity": false,
127
+ "weight_streaming": false,
128
+ "plugin_config": {
129
+ "dtype": "float16",
130
+ "bert_attention_plugin": "auto",
131
+ "gpt_attention_plugin": "auto",
132
+ "gemm_plugin": "auto",
133
+ "gemm_swiglu_plugin": null,
134
+ "fp8_rowwise_gemm_plugin": null,
135
+ "qserve_gemm_plugin": null,
136
+ "identity_plugin": null,
137
+ "nccl_plugin": null,
138
+ "lora_plugin": null,
139
+ "dora_plugin": false,
140
+ "weight_only_groupwise_quant_matmul_plugin": null,
141
+ "weight_only_quant_matmul_plugin": "float16",
142
+ "smooth_quant_plugins": true,
143
+ "smooth_quant_gemm_plugin": null,
144
+ "layernorm_quantization_plugin": null,
145
+ "rmsnorm_quantization_plugin": null,
146
+ "quantize_per_token_plugin": false,
147
+ "quantize_tensor_plugin": false,
148
+ "moe_plugin": null,
149
+ "mamba_conv1d_plugin": "auto",
150
+ "low_latency_gemm_plugin": null,
151
+ "low_latency_gemm_swiglu_plugin": null,
152
+ "gemm_allreduce_plugin": null,
153
+ "context_fmha": false,
154
+ "bert_context_fmha_fp32_acc": false,
155
+ "paged_kv_cache": true,
156
+ "remove_input_padding": true,
157
+ "norm_quant_fusion": false,
158
+ "reduce_fusion": false,
159
+ "user_buffer": false,
160
+ "tokens_per_block": 32,
161
+ "use_paged_context_fmha": false,
162
+ "use_fp8_context_fmha": false,
163
+ "fuse_fp4_quant": false,
164
+ "multiple_profiles": false,
165
+ "paged_state": false,
166
+ "streamingllm": false,
167
+ "manage_weights": false,
168
+ "use_fused_mlp": true,
169
+ "pp_reduce_scatter": false
170
+ },
171
+ "use_strip_plan": false,
172
+ "max_encoder_input_len": 512,
173
+ "monitor_memory": false,
174
+ "use_mrope": false
175
+ }
176
+ }
int8_wo_cpu/1-gpu/decoder/rank0.engine ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9211b48bf602aea503042061f57ebd19ba4526661ccaf791efbb551c469f7edd
3
+ size 548152340
int8_wo_cpu/1-gpu/encoder/config.json ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.3.0rc8",
3
+ "pretrained_config": {
4
+ "architecture": "EncoderModel",
5
+ "dtype": "float16",
6
+ "vocab_size": 32128,
7
+ "hidden_size": 1024,
8
+ "num_hidden_layers": 24,
9
+ "num_attention_heads": 16,
10
+ "hidden_act": "gelu",
11
+ "logits_dtype": "float16",
12
+ "norm_epsilon": 1e-06,
13
+ "runtime_defaults": null,
14
+ "position_embedding_type": "relative",
15
+ "num_key_value_heads": 16,
16
+ "intermediate_size": 2816,
17
+ "max_position_embeddings": 512,
18
+ "mapping": {
19
+ "world_size": 1,
20
+ "gpus_per_node": 8,
21
+ "cp_size": 1,
22
+ "tp_size": 1,
23
+ "pp_size": 1,
24
+ "moe_tp_size": 1,
25
+ "moe_cluster_size": 1,
26
+ "moe_ep_size": 1,
27
+ "attn_tp_size": 1,
28
+ "attn_cp_size": 1,
29
+ "cp_config": {},
30
+ "enable_attention_dp": false,
31
+ "enable_lm_head_tp_in_adp": false
32
+ },
33
+ "quantization": {
34
+ "quant_algo": "W8A16",
35
+ "kv_cache_quant_algo": null,
36
+ "group_size": 128,
37
+ "smoothquant_val": 0.5,
38
+ "clamp_val": null,
39
+ "use_meta_recipe": false,
40
+ "has_zero_point": false,
41
+ "pre_quant_scale": false,
42
+ "exclude_modules": null,
43
+ "mamba_ssm_cache_dtype": null,
44
+ "mamba_ssm_stochastic_rounding": false,
45
+ "mamba_ssm_philox_rounds": 10
46
+ },
47
+ "use_parallel_embedding": true,
48
+ "embedding_sharding_dim": 0,
49
+ "head_size": 64,
50
+ "qk_layernorm": false,
51
+ "rotary_embedding_dim": 64,
52
+ "producer": {
53
+ "name": "modelopt",
54
+ "version": "0.37.0"
55
+ },
56
+ "share_embedding_table": false,
57
+ "residual_mlp": false,
58
+ "bias": false,
59
+ "rotary_pct": 1.0,
60
+ "rank": 0,
61
+ "decoder": "t5",
62
+ "rmsnorm": true,
63
+ "lm_head_bias": false,
64
+ "has_position_embedding": false,
65
+ "layernorm_type": 1,
66
+ "has_attention_qkvo_bias": false,
67
+ "has_mlp_bias": false,
68
+ "has_model_final_layernorm": true,
69
+ "mlp_type": 1,
70
+ "use_prompt_tuning": false,
71
+ "has_embedding_layernorm": false,
72
+ "has_embedding_scale": false,
73
+ "ffn_hidden_size": 2816,
74
+ "q_scaling": 0.125,
75
+ "layernorm_position": 0,
76
+ "relative_attention": true,
77
+ "max_distance": 128,
78
+ "num_buckets": 32,
79
+ "model_type": "t5",
80
+ "use_implicit_relative_attention": false,
81
+ "type_vocab_size": null,
82
+ "rescale_before_lm_head": false,
83
+ "encoder_hidden_size": null,
84
+ "encoder_num_heads": null,
85
+ "encoder_num_kv_heads": null,
86
+ "encoder_head_size": null,
87
+ "skip_cross_kv": false,
88
+ "residual_scaling": 1.0,
89
+ "has_lm_head_bias": false
90
+ },
91
+ "build_config": {
92
+ "max_input_len": 512,
93
+ "max_seq_len": 512,
94
+ "opt_batch_size": 8,
95
+ "max_batch_size": 4,
96
+ "max_beam_width": 1,
97
+ "max_num_tokens": 2048,
98
+ "opt_num_tokens": 4,
99
+ "max_prompt_embedding_table_size": 0,
100
+ "kv_cache_type": "paged",
101
+ "gather_context_logits": false,
102
+ "gather_generation_logits": false,
103
+ "strongly_typed": true,
104
+ "force_num_profiles": null,
105
+ "profiling_verbosity": "layer_names_only",
106
+ "enable_debug_output": false,
107
+ "max_draft_len": 0,
108
+ "speculative_decoding_mode": 1,
109
+ "use_refit": false,
110
+ "input_timing_cache": null,
111
+ "output_timing_cache": "model.cache",
112
+ "lora_config": {
113
+ "lora_dir": [],
114
+ "lora_ckpt_source": "hf",
115
+ "max_lora_rank": 64,
116
+ "lora_target_modules": [],
117
+ "trtllm_modules_to_hf_modules": {},
118
+ "max_loras": null,
119
+ "max_cpu_loras": null,
120
+ "swap_gate_up_proj_lora_b_weight": true
121
+ },
122
+ "weight_sparsity": false,
123
+ "weight_streaming": false,
124
+ "plugin_config": {
125
+ "dtype": "float16",
126
+ "bert_attention_plugin": "auto",
127
+ "gpt_attention_plugin": "auto",
128
+ "gemm_plugin": "auto",
129
+ "gemm_swiglu_plugin": null,
130
+ "fp8_rowwise_gemm_plugin": null,
131
+ "qserve_gemm_plugin": null,
132
+ "identity_plugin": null,
133
+ "nccl_plugin": null,
134
+ "lora_plugin": null,
135
+ "dora_plugin": false,
136
+ "weight_only_groupwise_quant_matmul_plugin": null,
137
+ "weight_only_quant_matmul_plugin": "float16",
138
+ "smooth_quant_plugins": true,
139
+ "smooth_quant_gemm_plugin": null,
140
+ "layernorm_quantization_plugin": null,
141
+ "rmsnorm_quantization_plugin": null,
142
+ "quantize_per_token_plugin": false,
143
+ "quantize_tensor_plugin": false,
144
+ "moe_plugin": null,
145
+ "mamba_conv1d_plugin": "auto",
146
+ "low_latency_gemm_plugin": null,
147
+ "low_latency_gemm_swiglu_plugin": null,
148
+ "gemm_allreduce_plugin": null,
149
+ "context_fmha": false,
150
+ "bert_context_fmha_fp32_acc": false,
151
+ "paged_kv_cache": true,
152
+ "remove_input_padding": true,
153
+ "norm_quant_fusion": false,
154
+ "reduce_fusion": false,
155
+ "user_buffer": false,
156
+ "tokens_per_block": 32,
157
+ "use_paged_context_fmha": false,
158
+ "use_fp8_context_fmha": false,
159
+ "fuse_fp4_quant": false,
160
+ "multiple_profiles": false,
161
+ "paged_state": false,
162
+ "streamingllm": false,
163
+ "manage_weights": false,
164
+ "use_fused_mlp": true,
165
+ "pp_reduce_scatter": false
166
+ },
167
+ "use_strip_plan": false,
168
+ "max_encoder_input_len": 1024,
169
+ "monitor_memory": false,
170
+ "use_mrope": false
171
+ }
172
+ }
int8_wo_cpu/1-gpu/encoder/rank0.engine ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4659fd2a79e7f37cc1b3c94a0167526f6ba149c610c56759aca5e7c019ceea07
3
+ size 378946836