import os
import argparse

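# A minimal usage sketch (the script name and paths below are hypothetical):
#   python convert_bf16.py --model_path ./outputs/checkpoint-500 \
#       --save_path ./outputs/merged_bf16 --msl 8192 --cuda_device 0
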
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, required=True,
                    help="Path to the fine-tuned model/adapter to convert.")
parser.add_argument("--save_path", type=str, required=True,
                    help="Path to save the converted BF16 model.")
parser.add_argument("--msl", type=int, default=8192,
                    help="Maximum sequence length for the model.")
parser.add_argument("--cuda_device", type=str, default="2",
                    help="CUDA device index to use.")
args = parser.parse_args()

# Device selection must happen before torch (and unsloth) are imported;
# once torch initializes CUDA, changing CUDA_VISIBLE_DEVICES has no effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device

import torch
from unsloth import FastLanguageModel


def convert_and_save():
    print(f"Loading model from: {args.model_path}")

    # Load the fine-tuned model (or LoRA adapter) in full BF16 precision.
    # load_in_4bit=False ensures no quantization is applied during loading.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_path,
        max_seq_length=args.msl,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {args.save_path}")

    # Merge any adapter weights into the base model and write a standalone
    # 16-bit checkpoint (Unsloth's "merged_16bit" save method).
    model.save_pretrained_merged(
        args.save_path,
        tokenizer,
        save_method="merged_16bit",
    )

    print("Conversion complete. You can now use this path for vLLM or standard inference.")


if __name__ == "__main__":
    convert_and_save()
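
# Follow-up sketch: loading the merged checkpoint with vLLM, as the final
# print suggests. Assumes vLLM is installed; `LLM` is vLLM's offline-inference
# entry point, and the path is whatever was passed as --save_path.
#
#   from vllm import LLM
#   llm = LLM(model="./outputs/merged_bf16", dtype="bfloat16")
#   print(llm.generate("Hello, world!"))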