| import os |
| import argparse |
| |
| |
| |
| |
# Command-line interface. Parsed once at import time so the resulting
# ``args`` namespace is available module-wide (the conversion routine
# below reads it).
parser = argparse.ArgumentParser()
for flag, options in (
    ("--model_path",
     dict(type=str, required=True,
          help="Path to the fine-tuned model/adapter to convert.")),
    ("--save_path",
     dict(type=str, required=True,
          help="Path to save the converted BF16 model.")),
    ("--msl",
     dict(type=int, default=8192,
          help="Maximum sequence length for the model.")),
    ("--cuda_device",
     dict(type=str, default="2",
          help="CUDA device index to use.")),
):
    parser.add_argument(flag, **options)
args = parser.parse_args()
|
|
| |
# NOTE: the GPU-selection environment variables are deliberately set
# BEFORE torch/unsloth are imported below — presumably so the CUDA
# runtime sees the restricted device list when it initializes. Keep
# this ordering intact.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES=args.cuda_device,
)

import torch
from unsloth import FastLanguageModel
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
def convert_and_save(model_path=None, save_path=None, max_seq_length=None):
    """Merge a fine-tuned model/adapter and save it as a BF16 checkpoint.

    Every parameter falls back to the parsed command-line arguments, so
    the zero-argument call used by the script entry point keeps working,
    while the function can now also be imported and driven
    programmatically with explicit paths.

    Args:
        model_path: Path of the fine-tuned model/adapter to load.
            Defaults to ``args.model_path``.
        save_path: Destination directory for the merged BF16 model.
            Defaults to ``args.save_path``.
        max_seq_length: Maximum sequence length to configure the model
            with. Defaults to ``args.msl``.
    """
    model_path = args.model_path if model_path is None else model_path
    save_path = args.save_path if save_path is None else save_path
    max_seq_length = args.msl if max_seq_length is None else max_seq_length

    print(f"Loading model from: {model_path}")

    # Load in full bfloat16 precision (not 4-bit) so the merged weights
    # can be exported without quantization loss.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_path,
        max_seq_length=max_seq_length,
        dtype=torch.bfloat16,
        load_in_4bit=False,
    )

    print(f"Saving merged BF16 model to: {save_path}")

    # "merged_16bit" merges the adapter into the base weights and writes
    # a standard 16-bit checkpoint (usable by vLLM / plain transformers).
    model.save_pretrained_merged(
        save_path,
        tokenizer,
        save_method="merged_16bit",
    )

    print("Conversion complete. You can now use this path for vLLM or standard inference.")
|
|
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    convert_and_save()