File size: 2,455 Bytes
1db7196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import argparse
# Usage example:
# python /home/mshahidul/readctrl/code/finetune-inference/convert_fp16.py \
# --model_path /home/mshahidul/readctrl_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1 \
# --save_path /home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims-attribution_resonability_check_8kCtx_v1_BF16_merged \
# --cuda_device 2
parser = argparse.ArgumentParser()
parser.add_argument("--model_path", type=str, required=True,
                    help="Path to the fine-tuned model/adapter to convert.")
parser.add_argument("--save_path", type=str, required=True,
                    help="Path to save the converted BF16 model.")
parser.add_argument("--msl", type=int, default=8192,
                    help="Maximum sequence length for the model.")
parser.add_argument("--cuda_device", type=str, default="2",
                    help="CUDA device index to use.")
args = parser.parse_args()

# GPU visibility MUST be configured before torch/unsloth are imported below:
# CUDA initializes on first import, and changing these variables afterwards
# has no effect on device selection.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda_device
import torch
from unsloth import FastLanguageModel

# -----------------------------
#  CONFIGURATION
# -----------------------------
# Paths are now taken from --model_path / --save_path CLI flags;
# the hard-coded values below are kept only as historical examples.
# MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-32B_subclaims-extraction-8b_ctx"

# SAVE_PATH = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims_BF16_merged"

def convert_and_save():
    """Load the fine-tuned model in bfloat16, merge its adapter weights,
    and write the merged 16-bit model to ``args.save_path``.
    """
    src = args.model_path
    dst = args.save_path
    print(f"Loading model from: {src}")

    # Load the base + adapter in full bfloat16. Quantized (4-bit) loading is
    # explicitly disabled so the adapter can be merged into clean 16-bit weights.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=src,
        max_seq_length=args.msl,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {dst}")

    # save_method="merged_16bit" writes weights in whatever 16-bit dtype the
    # model was loaded with — bfloat16 in this case.
    model.save_pretrained_merged(
        dst,
        tokenizer,
        save_method="merged_16bit",
    )

    print("Conversion complete. You can now use this path for vLLM or standard inference.")

# Script entry point: run the conversion when executed directly.
if __name__ == "__main__":
    convert_and_save()