# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


"""
A script to convert the BigCode StarCoder checkpoints from HuggingFace to Megatron GPTModel.
This script is hardcoded specifically for the StarCoder pretrained models only, and is not
generalisable to any other models.

This script will load and convert the model entirely on CPU for OOM safety, but it is
possible to initialize the model on GPU before the save down. You can do this by adding --cuda
parameter to this script call.

This script requires that you have downloaded the StarCoder checkpoint from HuggingFace.
This can be done using Git with the following command:
```bash
git clone https://huggingface.co/bigcode/starcoder
```
Note that downloading this particular checkpoint requires authentication with a HuggingFace token.

The script will generate a Megatron model with TP=1 and PP=1. If you need different TP/PP
values, then after running this script, please use the following script to set whatever
TP/PP configuration you want:
    NeMo/examples/nlp/language_modeling/megatron_change_num_partitions.py

This script also requires a baseline config file from which to override default parameters.
You can specify the location of this file using the -c argument. Please use the config below
to correctly configure creating GPT-2 model in Megatron:
    NeMo/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml


Here is an example usage command:
```python
python convert_starcoder_hf_to_nemo.py \
    --input_name_or_path /path/to/starcoder \
    --output_path /path/to/save.nemo
```
"""

import argparse
import os
from typing import Dict

import lightning.pytorch as pl
import torch
import yaml
from omegaconf import OmegaConf
from transformers import AutoConfig, AutoModelForCausalLM

from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from nemo.collections.nlp.parts.utils_funcs import load_state_dict_helper
from nemo.utils import logging


def convert_state_dict(state_dict: Dict[str, torch.Tensor], amp: bool = False):
    def get_new_key(old_key):
        if old_key == "transformer.wte.weight":
            return "embedding.word_embeddings.weight"
        if old_key == "transformer.wpe.weight":
            return "embedding.position_embeddings.weight"
        elif old_key.startswith("transformer.ln_f"):
            return old_key.replace("transformer.ln_f", "decoder.final_layernorm")
        elif old_key.startswith("lm_head"):
            return old_key.replace("lm_head", "output_layer")
        else:
            p1 = old_key.replace("transformer.h", "decoder.layers")
            p2 = p1.replace("ln_1.", "self_attention.linear_qkv.layer_norm_")
            p3 = p2.replace("attn.c_proj", "self_attention.linear_proj")
            p4 = p3.replace("attn.c_attn", "self_attention.linear_qkv")
            p5 = p4.replace("ln_2.", "mlp.linear_fc1.layer_norm_")
            p6 = p5.replace("c_fc", "linear_fc1")
            p7 = p6.replace("c_proj", "linear_fc2")
            return p7

    new_dict = {}
    prefix = "model.module." if amp else "model."

    for old_key, val in state_dict.items():
        new_key = get_new_key(old_key)
        new_key = prefix + new_key
        new_dict[new_key] = val

    return new_dict


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_name_or_path",
        type=str,
        required=True,
        help="Path to Starcoder checkpoint from HuggingFace hub or local dir",
    )
    parser.add_argument("--output_path", type=str, required=True, help="Path to dir where to store output .nemo file")
    parser.add_argument(
        "--hparams_file",
        type=str,
        default=os.path.join(
            os.path.dirname(__file__), '../../examples/nlp/language_modeling/conf/megatron_gpt_config.yaml'
        ),
        required=False,
        help="Path config for restoring. It's created during training and may need to be modified during restore if restore environment is different than training. Ex: /raid/nemo_experiments/megatron_gpt/hparams.yaml",
    )
    parser.add_argument(
        "--precision", type=str, default="bf16", choices=["bf16", "32"], help="Precision for checkpoint weights saved"
    )
    parser.add_argument("--cuda", action="store_true", help="Put Nemo model onto GPU prior to saving")
    args = parser.parse_args()

    if not os.path.isdir(args.output_path):
        raise FileNotFoundError(f"Output directory '{args.output_path}' does not exist")

    hf_config = AutoConfig.from_pretrained(args.input_name_or_path)

    with open(args.hparams_file, "r", encoding="utf_8") as f:
        orig_cfg = yaml.safe_load(f)

    model_dict = orig_cfg["model"]

    if "data" in model_dict:
        del model_dict["data"]

    override_model_dict = {
        "micro_batch_size": 1,
        "global_batch_size": 1,
        "tensor_model_parallel_size": 1,
        "pipeline_model_parallel_size": 1,
        "megatron_amp_O2": False,
        "transformer_engine": True,
        "use_cpu_initialization": not args.cuda,
        "normalization": "layernorm",
        "mcore_gpt": True,
        "num_query_groups": 1,  # MQA
        "hidden_size": hf_config.n_embd,
        "encoder_seq_length": hf_config.n_positions,
        "max_position_embeddings": hf_config.n_positions,
        "num_layers": hf_config.n_layer,
        "num_attention_heads": hf_config.n_head,
        "ffn_hidden_size": hf_config.n_inner,
        "layernorm_epsilon": hf_config.layer_norm_epsilon,
        "pre_process": True,
        "post_process": True,
        "apply_query_key_layer_scaling": True,
        "bias": True,
        "transformer_block_type": "pre_ln",
        "fp32_residual_connection": False,
        "hidden_dropout": hf_config.summary_first_dropout,
        "attention_dropout": hf_config.attn_pdrop,
        "ffn_dropout": 0,
        "share_embeddings_and_output_weights": False,
        "position_embedding_type": "learned_absolute",
        "normalize_attention_scores": True,
        "precision": args.precision,
    }
    tokenizer_dict = {
        "library": "huggingface",
        "type": args.input_name_or_path,
        "use_fast": True,
    }
    trainer_dict = {
        "devices": 1,
        "num_nodes": 1,
        "accelerator": "gpu" if args.cuda else "cpu",
        "precision": args.precision,
        "logger": False,
        "enable_checkpointing": False,
        "max_epochs": -1,
        "max_steps": 100000,
        "log_every_n_steps": 10,
        "val_check_interval": 100,
        "limit_val_batches": 50,
        "limit_test_batches": 500,
        "accumulate_grad_batches": 1,
        "gradient_clip_val": 1.0,
        "benchmark": False,
        "enable_model_summary": False,
        "strategy": NLPDDPStrategy(),
    }

    model_dict.update(override_model_dict)
    model_dict["tokenizer"] = tokenizer_dict

    omega_cfg = OmegaConf.create(model_dict)

    trainer = pl.Trainer(**trainer_dict)

    logging.info("Loading HuggingFace model...")
    model_hf = AutoModelForCausalLM.from_pretrained(args.input_name_or_path)
    logging.info(f"Loaded model:\n{model_hf}")

    state_dict_hf = model_hf.state_dict()
    convert_dict = convert_state_dict(state_dict_hf, amp=omega_cfg.megatron_amp_O2)

    logging.info("Creating Megatron model...")
    omega_cfg.cpu_offloading_num_layers = 0
    model = load_state_dict_helper(MegatronGPTModel, omega_cfg, trainer, convert_dict)
    logging.info(f"Created model:\n{model}")

    logging.info("Saving model...")
    # We make sure that the tokenizer can be instantiated later regardless of args.input_name_or_path
    model.cfg.tokenizer.update(type="bigcode/starcoder")
    dtype = torch.bfloat16 if args.precision == "bf16" else torch.float32
    model = model.to(dtype=dtype)
    model.cfg.update(use_cpu_initialization=False)
    model.save_to(os.path.join(args.output_path, "megatron_starcoder_tp1_pp1.nemo"))
    logging.info("Done.")