import os
import argparse

# Example invocation:
# python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
#   --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1 \
#   --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged \
#   --cuda_device 2

_cli = argparse.ArgumentParser()
_cli.add_argument(
    "--model_path",
    type=str,
    required=True,
    help="Path to the fine-tuned model/adapter to convert.",
)
_cli.add_argument(
    "--save_path",
    type=str,
    required=True,
    help="Path to save the converted BF16 model.",
)
_cli.add_argument(
    "--msl",
    type=int,
    default=8192,
    help="Maximum sequence length for the model.",
)
_cli.add_argument(
    "--cuda_device",
    type=str,
    default="2",
    help="CUDA device index to use.",
)
args = _cli.parse_args()

# Pin GPU visibility BEFORE torch/unsloth are imported, so device
# enumeration honours the requested card rather than the default set.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device

import torch
from unsloth import FastLanguageModel
def convert_and_save(model_path=None, save_path=None, max_seq_length=None):
    """Load a fine-tuned model, merge any adapters, and save it as BF16.

    Backward-compatible generalization: each parameter defaults to the
    corresponding command-line value, so the original ``convert_and_save()``
    call still works, while callers may now override paths programmatically.

    Args:
        model_path: Source model/adapter directory (defaults to --model_path).
        save_path: Destination directory for the merged model
            (defaults to --save_path).
        max_seq_length: Maximum sequence length (defaults to --msl).
    """
    model_path = args.model_path if model_path is None else model_path
    save_path = args.save_path if save_path is None else save_path
    max_seq_length = args.msl if max_seq_length is None else max_seq_length

    print(f"Loading model from: {model_path}")
    # load_in_4bit=False is required for a clean 16-bit merge, and
    # dtype=torch.bfloat16 makes save_method="merged_16bit" emit bfloat16
    # (it saves in whichever 16-bit dtype the model was loaded with).
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=max_seq_length,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {save_path}")
    model.save_pretrained_merged(
        save_path,
        tokenizer,
        save_method="merged_16bit",
    )
    print("Conversion complete. You can now use this path for vLLM or standard inference.")


if __name__ == "__main__":
    convert_and_save()