""" |
|
|
A script to convert the BigCode StarCoder checkpoints from HuggingFace to Megatron GPTModel. |
|
|
This script is hardcoded specifically for the StarCoder pretrained models only, and is not |
|
|
generalisable to any other models. |
|
|
|
|
|
This script will load and convert the model entirely on CPU for OOM safety, but it is |
|
|
possible to initialize the model on GPU before the save down. You can do this by adding --cuda |
|
|
parameter to this script call. |
|
|
|
|
|
This script requires that you have downloaded the StarCoder checkpoint from HuggingFace. |
|
|
This can be done using Git with the following command: |
|
|
```bash |
|
|
git clone https://huggingface.co/bigcode/starcoder |
|
|
``` |
|
|
Note that downloading this particular checkpoint requires authentication with a HuggingFace token. |
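For example, you can authenticate beforehand with the Hugging Face CLI (it can also
store the token as a Git credential):
```bash
huggingface-cli login
```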

The script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP
values, then after running this script, please use the following script to set whatever
TP/PP configuration you want, as shown in the example below:
NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py
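For example, to repartition the TP=1/PP=1 output to TP=2 (these flag names are an
assumption about that script's CLI; verify them against its --help before running):
```bash
python megatron_change_num_partitions.py \
    --model_file /path/to/output_dir/megatron_starcoder_tp1_pp1.nemo \
    --target_file /path/to/output_dir/megatron_starcoder_tp2_pp1.nemo \
    --tensor_model_parallel_size 1 \
    --target_tensor_model_parallel_size 2 \
    --pipeline_model_parallel_size 1 \
    --target_pipeline_model_parallel_size 1
```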

This script also requires a baseline config file from which to override default parameters.
You can specify the location of this file using the --hparams_file argument. Please use the
config below to correctly configure the GPT model in Megatron:
NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml

Here is an example usage command:
```bash
python convert_starcoder_hf_to_nemo.py \
    --input_name_or_path /path/to/starcoder \
    --output_path /path/to/output_dir
```
Note that --output_path must be an existing directory; the converted checkpoint is saved
inside it as megatron_starcoder_tp1_pp1.nemo.
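
The resulting file can then be restored in NeMo. A minimal sketch (restore_from is the
standard NeMo restore API; the exact Trainer settings depend on your environment):
```python
import lightning.pytorch as pl
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

trainer = pl.Trainer(devices=1, accelerator="gpu", strategy=NLPDDPStrategy())
model = MegatronGPTModel.restore_from(
    "/path/to/output_dir/megatron_starcoder_tp1_pp1.nemo", trainer=trainer
)
```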
"""

import argparse
import os
from typing import Dict

import lightning.pytorch as pl
import torch
import yaml
from omegaconf import OmegaConf
from transformers import AutoConfig, AutoModelForCausalLM

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper
from nemo.utils import logging


def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False):
    """Map StarCoder (GPTBigCode) HF parameter names to Megatron GPTModel names."""

    def get_new_key(old_key):
        if old_key == "transformer.wte.weight":
            return "embedding.word_embeddings.weight"
        elif old_key == "transformer.wpe.weight":
            return "embedding.position_embeddings.weight"
        elif old_key.startswith("transformer.ln_f"):
            return old_key.replace("transformer.ln_f", "decoder.final_layernorm")
        elif old_key.startswith("lm_head"):
            return old_key.replace("lm_head", "output_layer")
        else:
            # Per-layer replacements: attention/MLP projections and their fused layernorms.
            new_key = old_key.replace("transformer.h", "decoder.layers")
            new_key = new_key.replace("ln_1.", "self_attention.linear_qkv.layer_norm_")
            new_key = new_key.replace("attn.c_proj", "self_attention.linear_proj")
            new_key = new_key.replace("attn.c_attn", "self_attention.linear_qkv")
            new_key = new_key.replace("ln_2.", "mlp.linear_fc1.layer_norm_")
            new_key = new_key.replace("c_fc", "linear_fc1")
            new_key = new_key.replace("c_proj", "linear_fc2")
            return new_key

    # NeMo wraps parameters in a "model." prefix ("model.module." when megatron_amp_O2 is on).
    new_dict = {}
    prefix = "model.module." if amp else "model."

    for old_key, val in state_dict.items():
        new_key = prefix + get_new_key(old_key)
        new_dict[new_key] = val

    return new_dict
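

# Example of the mapping produced by convert_state_dict() (derived from the rules above):
#   "transformer.h.0.attn.c_attn.weight" -> "model.decoder.layers.0.self_attention.linear_qkv.weight"
#   (the prefix becomes "model.module." instead of "model." when amp=True)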
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_name_or_path",
        type=str,
        required=True,
        help="Path to the StarCoder checkpoint from the HuggingFace Hub or a local directory",
    )
    parser.add_argument(
        "--output_path", type=str, required=True, help="Path to an existing directory where the output .nemo file will be stored"
    )
    parser.add_argument(
        "--hparams_file",
        type=str,
        default=os.path.join(
            os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml'
        ),
        required=False,
        help="Path to the config used for restoring. It is created during training and may need to be modified if the restore environment differs from the training environment. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
    )
    parser.add_argument(
        "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision of the saved checkpoint weights"
    )
    parser.add_argument("--cuda", action="store_true", help="Put the NeMo model on GPU prior to saving")
    args = parser.parse_args()

    if not os.path.isdir(args.output_path):
        raise FileNotFoundError(f"Output directory '{args.output_path}' does not exist")

    hf_config = AutoConfig.from_pretrained(args.input_name_or_path)

    with open(args.hparams_file, "r", encoding="utf_8") as f:
        orig_cfg = yaml.safe_load(f)

    model_dict = orig_cfg["model"]

    # The dataset config is irrelevant for a weight conversion, so drop it.
    if "data" in model_dict:
        del model_dict["data"]

    # Architecture values are derived from the HF StarCoder config; the rest are
    # fixed for this TP=1/PP=1 conversion.
    override_model_dict = {
        "micro_batch_size": 1,
        "global_batch_size": 1,
        "tensor_model_parallel_size": 1,
        "pipeline_model_parallel_size": 1,
        "megatron_amp_O2": False,
        "transformer_engine": True,
        "use_cpu_initialization": not args.cuda,
        "normalization": "layernorm",
        "mcore_gpt": True,
        # StarCoder uses multi-query attention: a single KV head shared by all query heads.
        "num_query_groups": 1,
        "hidden_size": hf_config.n_embd,
        "encoder_seq_length": hf_config.n_positions,
        "max_position_embeddings": hf_config.n_positions,
        "num_layers": hf_config.n_layer,
        "num_attention_heads": hf_config.n_head,
        "ffn_hidden_size": hf_config.n_inner,
        "layernorm_epsilon": hf_config.layer_norm_epsilon,
        "pre_process": True,
        "post_process": True,
        "apply_query_key_layer_scaling": True,
        "bias": True,
        "transformer_block_type": "pre_ln",
        "fp32_residual_connection": False,
        "hidden_dropout": hf_config.summary_first_dropout,
        "attention_dropout": hf_config.attn_pdrop,
        "ffn_dropout": 0,
        "share_embeddings_and_output_weights": False,
        "position_embedding_type": "learned_absolute",
        "normalize_attention_scores": True,
        "precision": args.precision,
    }
    tokenizer_dict = {
        "library": "huggingface",
        "type": args.input_name_or_path,
        "use_fast": True,
    }
    # The trainer settings are placeholders needed to construct the model;
    # no training or evaluation happens in this script.
    trainer_dict = {
        "devices": 1,
        "num_nodes": 1,
        "accelerator": "gpu" if args.cuda else "cpu",
        "precision": args.precision,
        "logger": False,
        "enable_checkpointing": False,
        "max_epochs": -1,
        "max_steps": 100000,
        "log_every_n_steps": 10,
        "val_check_interval": 100,
        "limit_val_batches": 50,
        "limit_test_batches": 500,
        "accumulate_grad_batches": 1,
        "gradient_clip_val": 1.0,
        "benchmark": False,
        "enable_model_summary": False,
        "strategy": NLPDDPStrategy(),
    }

    model_dict.update(override_model_dict)
    model_dict["tokenizer"] = tokenizer_dict

    omega_cfg = OmegaConf.create(model_dict)

    trainer = pl.Trainer(**trainer_dict)

    logging.info("Loading HuggingFace model...")
    model_hf = AutoModelForCausalLM.from_pretrained(args.input_name_or_path)
    logging.info(f"Loaded model:\n{model_hf}")

    state_dict_hf = model_hf.state_dict()
    convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2)

    logging.info("Creating Megatron model...")
    # Explicitly disable CPU offloading before instantiating the model.
    omega_cfg.cpu_offloading_num_layers = 0
    model = load_state_dict_helper(MegatronGPTModel, omega_cfg, trainer, convert_dict)
    logging.info(f"Created model:\n{model}")

    logging.info("Saving model...")

    # Record the Hub model ID instead of the local input path, so the saved .nemo
    # file can locate its tokenizer on any machine.
    model.cfg.tokenizer.update(type="bigcode/starcoder")
    dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32
    model = model.to(dtype=dtype)
    # Avoid forcing CPU initialization when the saved checkpoint is later restored.
    model.cfg.update(use_cpu_initialization=False)
    model.save_to(os.path.join(args.output_path, "megatron_starcoder_tp1_pp1.nemo"))
    logging.info("Done.")