import os

import fire
import torch
from mcore_adapter.models.converter.post_converter import convert_checkpoint_to_hf, convert_checkpoint_to_mca
from mcore_adapter.training_args import DistributingParallelArguments
from mcore_adapter.utils import get_logger
from transformers import AutoConfig

logger = get_logger(__name__)

def convert_mca_to_hf(
    checkpoint_path: str,
    output_path: str = "./output",
    bf16: bool = False,
    fp16: bool = False,
    convert_model_max_length: int | None = None,
):
    """Convert a Megatron (MCA) checkpoint to HuggingFace format.

    Args:
        checkpoint_path: Path to the checkpoint to convert.
        output_path: Path to save the converted checkpoint.
        bf16: Use bfloat16 precision.
        fp16: Use float16 precision.
        convert_model_max_length: Override model_max_length in the HF config.json.
    """
    if bf16 and fp16:
        raise ValueError("bf16 and fp16 cannot both be True.")

    # Resolve the requested precision; None keeps the checkpoint's original dtype.
    torch_dtype = None
    if bf16:
        torch_dtype = torch.bfloat16
    elif fp16:
        torch_dtype = torch.float16

    convert_checkpoint_to_hf(checkpoint_path, output_path, torch_dtype=torch_dtype)

    # Optionally patch model_max_length in the exported HF config.json.
    if convert_model_max_length is not None:
        config = AutoConfig.from_pretrained(output_path, trust_remote_code=True)
        config.model_max_length = convert_model_max_length
        config.save_pretrained(output_path)
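
# Illustrative usage sketch (hypothetical paths, not invoked by this script):
# calling the helper directly from Python to export an MCA checkpoint to HF in
# bfloat16 while patching model_max_length in the resulting config.json.
#
#   convert_mca_to_hf(
#       checkpoint_path="./checkpoints/my_mca_ckpt",  # hypothetical path
#       output_path="./my_hf_model",                  # hypothetical path
#       bf16=True,
#       convert_model_max_length=32768,
#   )
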
def convert(
    checkpoint_path: str,
    output_path: str = "./output",
    bf16: bool = False,
    fp16: bool = False,
    convert_model_max_length: int | None = None,
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    expert_model_parallel_size: int = 1,
    virtual_pipeline_model_parallel_size: int | None = None,
):
    """Convert a checkpoint between MCA and HuggingFace formats.

    The direction is inferred automatically: a checkpoint containing an
    mca_config.json is treated as MCA and converted to HF; otherwise it is
    treated as HF and converted to MCA.

    Args:
        checkpoint_path: Path to the checkpoint to convert.
        output_path: Path to save the converted checkpoint.
        bf16: Use bfloat16 precision.
        fp16: Use float16 precision.
        convert_model_max_length: Override model_max_length in the HF config.json.
        tensor_model_parallel_size: Tensor model parallel size.
        pipeline_model_parallel_size: Pipeline model parallel size.
        expert_model_parallel_size: Expert model parallel size.
        virtual_pipeline_model_parallel_size: Virtual pipeline model parallel size.
    """
    if bf16 and fp16:
        raise ValueError("bf16 and fp16 cannot both be True.")

    # The presence of mca_config.json marks an MCA checkpoint and decides the
    # conversion direction.
    mca_config_path = os.path.join(checkpoint_path, "mca_config.json")
    from_mca = os.path.exists(mca_config_path)

    if not from_mca:
        # HF -> MCA: the parallelism layout of the target checkpoint is required.
        dist_args = DistributingParallelArguments(
            tensor_model_parallel_size=tensor_model_parallel_size,
            pipeline_model_parallel_size=pipeline_model_parallel_size,
            expert_model_parallel_size=expert_model_parallel_size,
            virtual_pipeline_model_parallel_size=virtual_pipeline_model_parallel_size,
        )
        convert_checkpoint_to_mca(
            checkpoint_path,
            output_path,
            dist_args,
            bf16=bf16,
            fp16=fp16,
        )
    else:
        # MCA -> HF: parallelism arguments are not needed for the merged export.
        convert_mca_to_hf(
            checkpoint_path=checkpoint_path,
            output_path=output_path,
            bf16=bf16,
            fp16=fp16,
            convert_model_max_length=convert_model_max_length,
        )
def main():
    fire.Fire(convert)


if __name__ == "__main__":
    main()
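
# Illustrative CLI sketch (hypothetical script name and paths): python-fire exposes
# the parameters of `convert` as command-line flags, so an HF -> MCA conversion
# might look like:
#
#   python convert.py \
#       --checkpoint_path ./my_hf_model \
#       --output_path ./my_mca_ckpt \
#       --bf16 True \
#       --tensor_model_parallel_size 2 \
#       --pipeline_model_parallel_size 2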