"""Merge 7B LoRA adapter with FP16 base (NOT quantized) and push.

Run as a script with ``HF_TOKEN`` set in the environment.  The script:

1. logs in to the Hugging Face Hub with that token,
2. loads the base model in float16 on the CPU,
3. applies the LoRA adapter and folds it into the base weights,
4. uploads the merged model and the adapter repo's tokenizer to the
   merged-model repository.
"""
import gc
import os

import torch
from huggingface_hub import login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Repository identifiers, defined once so the load and push steps cannot
# drift apart.
BASE_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
ADAPTER_REPO_ID = "devsomosahub/agent-os-adapter-7b"
MERGED_REPO_ID = "devsomosahub/agent-os-7b-merged"


def _require_token() -> str:
    """Return the Hub token from ``HF_TOKEN`` or exit with a clear message.

    Raises:
        SystemExit: if ``HF_TOKEN`` is unset or empty.
    """
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise SystemExit(
            "HF_TOKEN environment variable is not set; "
            "export a Hugging Face token with write access and retry."
        )
    return token


def _load_merged_model():
    """Load the FP16 base model, apply the LoRA adapter, and merge it.

    Returns:
        The object returned by ``PeftModel.merge_and_unload()``.
    """
    print("Loading Qwen 7B FP16 on CPU...")
    # NOTE(review): trust_remote_code=True allows Python code shipped in the
    # repository to run locally. Confirm it is actually required for this
    # base model and drop it if not.
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="cpu",
        trust_remote_code=True,
    )

    print("Loading adapter...")
    peft_model = PeftModel.from_pretrained(base, ADAPTER_REPO_ID)

    print("Merging...")
    return peft_model.merge_and_unload()


def main() -> None:
    """Authenticate, merge the adapter into the base model, and push."""
    token = _require_token()
    login(token=token)

    merged = _load_merged_model()
    # The PEFT wrapper went out of scope when the helper returned; collect
    # now (best effort) so it is not held in RAM while shards are written.
    gc.collect()

    # NOTE(review): same trust_remote_code concern as for the base model;
    # here the code would come from the adapter repository.
    tok = AutoTokenizer.from_pretrained(ADAPTER_REPO_ID, trust_remote_code=True)

    print("Pushing merged 7B to Hub...")
    merged.push_to_hub(MERGED_REPO_ID, token=token, max_shard_size="2GB")
    tok.push_to_hub(MERGED_REPO_ID, token=token)
    print(f"DONE! https://huggingface.co/{MERGED_REPO_ID}")


if __name__ == "__main__":
    main()