"""Merge a fine-tuned Unsloth adapter into its base model and save it in BF16.

Example:
    python convert_fp16.py \
        --model_path /path/to/finetuned_adapter \
        --save_path /path/to/output_BF16_merged \
        --cuda_device 2
"""

import argparse
import os

parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, required=True,
                    help="Path to the fine-tuned model/adapter to convert.")
parser.add_argument("--save_path", type=str, required=True,
                    help="Path to save the converted BF16 model.")
parser.add_argument("--msl", type=int, default=8192,
                    help="Maximum sequence length for the model.")
parser.add_argument("--cuda_device", type=str, default="2",
                    help="CUDA device index to use.")
args = parser.parse_args()

# GPU visibility must be configured BEFORE torch is imported — torch reads
# CUDA_VISIBLE_DEVICES at import time, which is why the imports below are
# deliberately placed after this env setup rather than at the top of the file.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device

import torch  # noqa: E402  (must follow CUDA env setup above)
from unsloth import FastLanguageModel  # noqa: E402


def convert_and_save(model_path=None, save_path=None, max_seq_length=None):
    """Load the model/adapter, merge to full weights, and save as BF16.

    Args:
        model_path: Source model/adapter path. Defaults to ``--model_path``.
        save_path: Destination directory. Defaults to ``--save_path``.
        max_seq_length: Context length for loading. Defaults to ``--msl``.
    """
    # Fall back to the CLI arguments so a bare convert_and_save() call
    # behaves exactly as before.
    model_path = args.model_path if model_path is None else model_path
    save_path = args.save_path if save_path is None else save_path
    max_seq_length = args.msl if max_seq_length is None else max_seq_length

    print(f"Loading model from: {model_path}")
    # Load explicitly in bfloat16 with 4-bit quantization disabled so the
    # subsequent merge produces clean 16-bit weights.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=max_seq_length,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {save_path}")
    # 'merged_16bit' saves in whichever 16-bit dtype the model was loaded
    # with — bfloat16 here, since we loaded with dtype=torch.bfloat16.
    model.save_pretrained_merged(
        save_path,
        tokenizer,
        save_method="merged_16bit",
    )
    print("Conversion complete. You can now use this path for vLLM or standard inference.")


if __name__ == "__main__":
    convert_and_save()