"""Merge a LoRA-finetuned Qwen3-8B adapter into a full 16-bit checkpoint for vLLM serving."""
import os

# Pin GPU selection BEFORE importing unsloth/torch: CUDA reads these env vars
# once at initialization, so they must be set prior to any CUDA-touching import.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from unsloth import FastLanguageModel

# Path to the finetuned (LoRA adapter) model directory.
MODEL_PATH = "/home/mshahidul/readctrl_model/qwen3-8B_subclaims-verifier_lora_nonreasoning"

# Load in full precision (no 4-bit/8-bit quantization) so the merged weights
# are written at 16-bit fidelity rather than dequantized from a quantized base.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_PATH,
    max_seq_length=8192,
    load_in_4bit=False,
    load_in_8bit=False,
)

# Merge the LoRA adapter into the base weights and save as a standalone
# 16-bit checkpoint that vLLM can load directly (vLLM cannot apply unmerged
# unsloth LoRA adapters on the fly from this layout).
SAVE_PATH = "/home/mshahidul/readctrl_model/support_checking_vllm"
model.save_pretrained_merged(SAVE_PATH, tokenizer, save_method="merged_16bit")