| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import os |
| from argparse import ArgumentParser |
| from collections import OrderedDict |
|
|
| import torch |
| from omegaconf import open_dict |
| from pytorch_lightning import Trainer |
| from transformers import AutoModelForCausalLM, GemmaTokenizer, GemmaTokenizerFast, convert_slow_tokenizer |
|
|
| from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel |
| from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids |
| from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy |
| from nemo.utils import logging |
|
|
| """ |
| Script to convert a gemma checkpoint in nemo (mcore path) into a HuggingFace checkpoint. |
| This script can be used to 1) generate only the HF weights, or 2) generate an entire HF model folder. |
| This script is adapted from convert_llama_nemo_to_hf.py |
| |
| 1) Generate only HF weights from a nemo file: |
| |
| python convert_gemma_nemo_to_hf.py \ |
| --input_name_or_path /workspace/pretrained/HF_TO_NEMO/gemma-2b-it \ |
| --output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it/pytorch_model.bin |
| |
| 2) Generate the full HF model folder |
| |
| python convert_gemma_nemo_to_hf.py \ |
| --input_name_or_path /workspace/pretrained/HF_TO_NEMO/gemma-2b-it \ |
| --output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it/pytorch_model.bin \ |
| --hf_input_path /workspace/pretrained/HF_MODELS/gemma-2b-it \ |
| --hf_output_path /workspace/pretrained/NEMO_TO_HF/gemma-2b-it \ |
| --input_tokenizer /workspace/pretrained/HF_MODELS/gemma-2b-it \ |
| --hf_output_tokenizer /workspace/pretrained/NEMO_TO_HF/gemma-2b-it \ |
| --precision 32 |
| |
| Use the --cpu-only flag if the model cannot fit in the GPU (e.g. Llama2 70b). |
| However this option makes the conversion script significantly slower. |
| """ |
|
|
|
|
def get_args():
    """Build and parse the command-line arguments for the conversion script."""
    arg_parser = ArgumentParser()
    arg_parser.add_argument(
        "--input_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to .nemo file or extracted folder",
    )
    arg_parser.add_argument("--output_path", type=str, default=None, required=True, help="Path to HF .bin file")
    arg_parser.add_argument(
        "--hf_input_path",
        type=str,
        default=None,
        help="A HF model path, e.g. a folder containing https://huggingface.co/meta-llama/Llama-2-7b-hf/tree/main",
    )
    arg_parser.add_argument(
        "--hf_output_path",
        type=str,
        default=None,
        help="Output HF model path, with the same format as above but user's own weights",
    )
    arg_parser.add_argument(
        "--input_tokenizer",
        type=str,
        default=None,
        help="Path to tokenizer used for the input nemo model. (need to extract the .nemo file first)",
    )
    arg_parser.add_argument(
        "--hf_output_tokenizer",
        type=str,
        default=None,
        help="Path to save the tokenizer used for the output HF model.",
    )
    arg_parser.add_argument(
        "--precision",
        type=str,
        default=None,
        help="Precision of output weights.Defaults to precision of the input nemo weights (model.cfg.trainer.precision)",
    )
    arg_parser.add_argument(
        "--cpu-only",
        action="store_true",
        help="Load model in cpu only. Useful if the model cannot fit in GPU memory, but this option makes the conversion script significantly slower.",
    )
    # Parse and return in one step; argparse maps "--cpu-only" to args.cpu_only.
    return arg_parser.parse_args()
|
|
|
|
def verify_forward(model_path, tokenizer_path, model_string):
    """
    Sanity-check a converted checkpoint by running one forward pass and logging
    the greedy next-token prediction for a fixed probe sentence.

    Args:
        model_path: HF model folder (when model_string == "hf") or a .nemo
            file/folder (when model_string == "nemo").
        tokenizer_path: HF tokenizer folder used to encode the probe text.
        model_string: either "hf" or "nemo"; selects the loading path.

    Raises:
        ValueError: if model_string is not one of the recognized values.
    """
    logging.info("=" * 100)
    logging.info(f"Verifying forward pass for {model_string}")

    input_texts = [
        'query: how much protein should an adult eat',
    ]
    logging.info(f"Running verifications {input_texts} ...")

    # Encode the probe text once; both backends consume the same token ids.
    tokenizer = GemmaTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
    tokenizer.pad_token = tokenizer.eos_token
    batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors="pt")
    batch_dict_cuda = {key: tensor.cuda() for key, tensor in batch_dict.items()}

    if model_string == "hf":
        model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
        model = model.cuda().eval()
        outputs = model(**batch_dict_cuda, output_hidden_states=True)
        next_token = outputs.logits[0, -1].argmax()
    elif model_string == 'nemo':
        # Restore the config first so parallelism can be forced to a single device.
        dummy_trainer = Trainer(devices=1, accelerator='auto', strategy=NLPDDPStrategy())
        model_config = MegatronGPTModel.restore_from(model_path, trainer=dummy_trainer, return_config=True)
        model_config.tensor_model_parallel_size = 1
        model_config.pipeline_model_parallel_size = 1
        model = MegatronGPTModel.restore_from(
            model_path, trainer=dummy_trainer, override_config_path=model_config, map_location=None
        )

        input_id_rows = batch_dict_cuda['input_ids']
        row_tensors = [torch.unsqueeze(torch.LongTensor(row), dim=0) for row in input_id_rows.cpu()]
        # NOTE(review): the second argument here is presumably meant to be the
        # eod token *id*, but the eos_token *string* is passed; with all three
        # reset/mask flags False the comparison looks inert — confirm against
        # get_ltor_masks_and_position_ids before relying on the masks.
        mask_pos_pairs = [
            get_ltor_masks_and_position_ids(row_tensor, tokenizer.eos_token, False, False, False)
            for row_tensor in row_tensors
        ]

        for tokens, mask_and_pos in zip(row_tensors, mask_pos_pairs):
            attn_mask, _, pos_ids = mask_and_pos

            outputs = model(
                tokens=tokens, text_position_ids=pos_ids.cuda(), attention_mask=attn_mask.cuda(), labels=None
            )
            next_token = outputs.squeeze()[-1].argmax()
    else:
        raise ValueError(f"Model string {model_string} not recognized.")

    logging.info(f"{model_string} predicted next token is: '{tokenizer.convert_ids_to_tokens([next_token])}'.")
    logging.info("=" * 100)
|
|
|
|
def convert(input_nemo_file, output_hf_file, precision=None, cpu_only=False) -> torch.dtype:
    """
    Convert NeMo weights to HF weights.

    Loads the mcore MegatronGPTModel from ``input_nemo_file``, remaps every
    parameter to the HuggingFace Gemma naming scheme, and saves the result
    with ``torch.save`` at ``output_hf_file``.

    Args:
        input_nemo_file: path to the .nemo checkpoint (or extracted folder).
        output_hf_file: destination .bin file for the HF-format state dict.
        precision: one of 32/"32", 16/"16"/"16-mixed", "bf16"/"bf16-mixed";
            defaults to the precision recorded in the nemo model config.
        cpu_only: load the model on CPU (much slower, but works when the model
            does not fit in GPU memory).

    Returns:
        The torch dtype the exported weights were cast to (consumed later by
        replace_hf_weights_and_tokenizer).
    """
    dummy_trainer = Trainer(devices=1, accelerator='cpu', strategy=NLPDDPStrategy())
    # Restore only the config first so parallelism can be forced to 1 before
    # the (expensive) weight load.
    model_config = MegatronGPTModel.restore_from(input_nemo_file, trainer=dummy_trainer, return_config=True)
    model_config.tensor_model_parallel_size = 1
    model_config.pipeline_model_parallel_size = 1
    if cpu_only:
        map_location = torch.device('cpu')
        model_config.use_cpu_initialization = True
    else:
        map_location = None

    if cpu_only:
        logging.info("******** Loading model on CPU. This will take a significant amount of time.")
    model = MegatronGPTModel.restore_from(
        input_nemo_file, trainer=dummy_trainer, override_config_path=model_config, map_location=map_location
    )
    # Fall back to the checkpoint's own precision when none was requested.
    if precision is None:
        precision = model.cfg.precision
    if precision in [32, "32"]:
        dtype = torch.float32
    elif precision in [16, "16", "16-mixed"]:
        dtype = torch.float16
    elif precision in ["bf16", "bf16-mixed"]:
        dtype = torch.bfloat16
    else:
        logging.warning(f"Precision string {precision} is not recognized, falling back to fp32")
        dtype = torch.float32
    logging.info(f"Using precision {dtype}")

    # Cast every exported tensor to the chosen output dtype.
    param_to_weights = lambda param: param.to(dtype)
    checkpoint = OrderedDict()

    hidden_size = model.cfg.hidden_size
    head_num = model.cfg.num_attention_heads
    num_layers = model.cfg.num_layers
    ffn_hidden_size = model.cfg.ffn_hidden_size
    # GQA: number of KV groups; defaults to MHA (one group per head).
    num_query_groups = model.cfg.get("num_query_groups", head_num)

    head_size = hidden_size // head_num
    heads_per_group = head_num // num_query_groups
    # Fused QKV row count: per group, heads_per_group Q heads plus one K and one V head.
    qkv_total_dim = head_num + 2 * num_query_groups

    embed_weight = model.state_dict()[f'model.embedding.word_embeddings.weight']
    embed_weights_base_name = f'model.embed_tokens.weight'
    checkpoint[embed_weights_base_name] = param_to_weights(embed_weight)
    for l in range(int(num_layers)):
        print(f"converting layer {l}")

        # Un-fuse the interleaved QKV projection: view as (rows, head_size, hidden).
        qkv_weights = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv.weight']
        qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])

        # Row indices of the Q heads (heads_per_group per group), followed by the
        # K and V rows of each group (at offsets heads_per_group and heads_per_group + 1).
        q_slice = torch.cat(
            [
                torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
                for i in range(num_query_groups)
            ]
        )
        k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
        v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))

        q_weights_base_name = f'model.layers.{l}.self_attn.q_proj.weight'
        k_weights_base_name = f'model.layers.{l}.self_attn.k_proj.weight'
        v_weights_base_name = f'model.layers.{l}.self_attn.v_proj.weight'

        checkpoint[q_weights_base_name] = param_to_weights(qkv_weights[q_slice].reshape(-1, hidden_size))
        checkpoint[k_weights_base_name] = param_to_weights(qkv_weights[k_slice].reshape(-1, hidden_size))
        checkpoint[v_weights_base_name] = param_to_weights(qkv_weights[v_slice].reshape(-1, hidden_size))

        # Attention output projection maps straight through.
        o_weight = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_proj.weight']
        o_weight_base_name = f'model.layers.{l}.self_attn.o_proj.weight'
        checkpoint[o_weight_base_name] = param_to_weights(o_weight)

        # linear_fc1 fuses the two MLP input projections: the first ffn_hidden_size
        # rows feed HF gate_proj, the remaining rows feed up_proj.
        # NOTE(review): the local variable names below are misleading (the "down"
        # variable holds the gate half and the "gate" variable holds the up half);
        # only the checkpoint-key mapping matters — confirm it mirrors the
        # companion HF->NeMo import script before renaming anything.
        mlp_weights = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.weight']
        mlp_down_proj_weight = mlp_weights[:ffn_hidden_size, :]
        mlp_gate_proj_weight = mlp_weights[ffn_hidden_size:, :]

        mlp_down_proj_base_name = f'model.layers.{l}.mlp.gate_proj.weight'
        mlp_gate_proj_base_name = f'model.layers.{l}.mlp.up_proj.weight'

        checkpoint[mlp_down_proj_base_name] = param_to_weights(mlp_down_proj_weight)
        checkpoint[mlp_gate_proj_base_name] = param_to_weights(mlp_gate_proj_weight)

        # linear_fc2 is the MLP output projection (HF down_proj).
        mlp_up_proj_weight = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc2.weight']
        mlp_up_proj_base_name = f'model.layers.{l}.mlp.down_proj.weight'
        checkpoint[mlp_up_proj_base_name] = param_to_weights(mlp_up_proj_weight)

        # Layer norms: the -1.0 offset suggests the NeMo checkpoint stores Gemma's
        # RMSNorm weight in (1 + w) form while HF stores w — TODO confirm the
        # HF->NeMo conversion applies the inverse (+1.0).
        input_ln_weight = model.state_dict()[f'model.decoder.layers.{l}.self_attention.linear_qkv.layer_norm_weight']
        input_ln_base_name = f'model.layers.{l}.input_layernorm.weight'
        checkpoint[input_ln_base_name] = param_to_weights(input_ln_weight - 1.0)

        post_attn_ln_weight = model.state_dict()[f'model.decoder.layers.{l}.mlp.linear_fc1.layer_norm_weight']
        post_attn_ln_base_name = f'model.layers.{l}.post_attention_layernorm.weight'
        checkpoint[post_attn_ln_base_name] = param_to_weights(post_attn_ln_weight - 1.0)

        print(f"done layer {l}")

    # Final norm gets the same -1.0 offset as the per-layer norms.
    final_ln_weight = model.state_dict()[f'model.decoder.final_layernorm.weight']
    final_ln_base_name = f'model.norm.weight'
    checkpoint[final_ln_base_name] = param_to_weights(final_ln_weight - 1.0)

    # lm_head shares the embedding matrix (tied weights): the same tensor is
    # exported under both the embedding and lm_head keys.
    output_layer_weight = model.state_dict()[
        f'model.embedding.word_embeddings.weight'
    ]
    output_layer_base_name = f'lm_head.weight'
    checkpoint[output_layer_base_name] = param_to_weights(output_layer_weight)

    os.makedirs(os.path.dirname(output_hf_file), exist_ok=True)
    torch.save(checkpoint, output_hf_file)
    logging.info(f"Weights saved to {output_hf_file}")

    return dtype
|
|
|
|
def replace_hf_weights_and_tokenizer(
    weights_file,
    dtype,
    input_hf_path,
    output_hf_path,
    tokenizer_path,
    output_hf_tokenizer,
):
    """
    Load a reference HF model, swap in the NeMo-exported weights, and save the
    result as a full HF model folder (optionally with a converted tokenizer).

    Args:
        weights_file: path to the .bin state dict produced by convert().
        dtype: torch dtype used to instantiate the reference HF model.
        input_hf_path: folder with the original HF model (supplies config/architecture).
        output_hf_path: destination folder for the full HF model.
        tokenizer_path: path to the slow (sentencepiece) tokenizer of the input
            nemo model; falsy to skip all tokenizer handling.
        output_hf_tokenizer: destination folder for the exported tokenizer.
    """
    model = AutoModelForCausalLM.from_pretrained(
        input_hf_path,
        local_files_only=True,
        torch_dtype=dtype,
    )
    # map_location='cpu' so the exported weights load even on a CPU-only host
    # when they were saved from GPU tensors; load_state_dict copies values, so
    # the source device does not matter.
    nemo_exported = torch.load(weights_file, map_location='cpu')

    if tokenizer_path:
        tokenizer = GemmaTokenizer.from_pretrained(
            tokenizer_path,
            local_files_only=True,
            legacy=False,
        )
        # Build a fast tokenizer from the slow one so both variants get saved.
        tmp_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(tokenizer)
        fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tmp_tokenizer)
        tokenizer_length = len(fast_tokenizer)
        # Keep the embedding table in sync with the tokenizer vocabulary size.
        model.resize_token_embeddings(tokenizer_length)

    model.load_state_dict(nemo_exported)
    model.save_pretrained(output_hf_path)
    logging.info(f"Full HF model saved to {output_hf_path}")

    if tokenizer_path:
        fast_tokenizer.save_pretrained(output_hf_tokenizer)
        tokenizer.save_pretrained(output_hf_tokenizer)
        logging.info(f"Tokenizer saved to {output_hf_tokenizer}")
|
|
|
|
if __name__ == '__main__':
    args = get_args()
    # Default the tokenizer output location to the HF model output folder.
    if not args.hf_output_tokenizer and args.hf_output_path:
        args.hf_output_tokenizer = args.hf_output_path
    # BUG FIX: convert() was never invoked, so no weights were ever exported,
    # and the replace_hf_weights_and_tokenizer call had been commented out
    # while still referencing the undefined `dtype`. Run the conversion first
    # and feed its dtype into the full-folder assembly step.
    dtype = convert(args.input_name_or_path, args.output_path, precision=args.precision, cpu_only=args.cpu_only)
    if args.hf_input_path and args.hf_output_path:
        replace_hf_weights_and_tokenizer(
            args.output_path,
            dtype,
            args.hf_input_path,
            args.hf_output_path,
            args.input_tokenizer,
            args.hf_output_tokenizer,
        )
        # Smoke-test both the source nemo model and the freshly written HF model.
        verify_forward(args.input_name_or_path, args.hf_output_tokenizer, "nemo")
        verify_forward(args.hf_output_path, args.hf_output_tokenizer, "hf")
    else:
        logging.info("`hf_input_path` and/or `hf_output_path` not provided, not generating full HF model.")
        logging.info(f".bin file is saved to {args.output_path}")
|
|