Spaces:
Running on Zero
finetune: merge LoRA via streaming HF export (avoids 2x-model save peak)
Browse files
finetune/nemo/run_merge_hf.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Merge the Warden LoRA into the base and export DIRECTLY to HF safetensors.
|
| 3 |
+
|
| 4 |
+
Why not the stock merge_lora.py: its save_megatron_model stages a second
|
| 5 |
+
~63GB copy of the model during the torch_dist save — on a 121GB unified-memory
|
| 6 |
+
GB10 that's what tripped the watchdog (and previously froze spark-eee9).
|
| 7 |
+
AutoBridge.save_hf_pretrained instead streams tensor-by-tensor into sharded
|
| 8 |
+
safetensors (peak ≈ model + one shard) and auto-merges LoRA adapters on the
|
| 9 |
+
way out. It also skips the merged-Megatron intermediate entirely: HF format
|
| 10 |
+
is what the GGUF conversion and the ZeroGPU Space load anyway.
|
| 11 |
+
|
| 12 |
+
Load + adapter-patch steps mirror /opt/Megatron-Bridge/examples/peft/merge_lora.py.
|
| 13 |
+
"""
|
| 14 |
+
import multiprocessing as mp
|
| 15 |
+
|
| 16 |
+
mp.set_start_method("forkserver", force=True)
|
| 17 |
+
|
| 18 |
+
import os # noqa: E402
|
| 19 |
+
import sys # noqa: E402
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def main():
    """Merge the finetuned LoRA adapters into the base model and export to HF format.

    Paths are read from the environment, each falling back to a default:

    * ``LORA_CKPT``     -- finetune checkpoint directory; ``run_config.yaml`` is
      read from it and the adapter tensors are loaded from it.
    * ``HF_DIR``        -- HF model directory used to build the bridge and
      passed as ``source_path`` on export.
    * ``MEGATRON_CKPT`` -- Megatron checkpoint of the base model.
    * ``OUT_DIR``       -- destination directory for the exported checkpoint.

    Raises:
        RuntimeError: if the loaded checkpoint has no ``model*`` entry, or if
            none of the loaded entries was actually applied to the model.
    """
    # Heavy imports stay function-local, as in the original layout.
    # NOTE(review): presumably so they run after the module-level
    # multiprocessing start method has been set -- confirm.
    from pathlib import Path

    import torch
    from megatron.core import dist_checkpointing
    from megatron.bridge.models.conversion.auto_bridge import AutoBridge
    from megatron.bridge.peft.lora import LoRA
    from megatron.bridge.training.checkpointing import (
        _generate_model_state_dict,
        apply_peft_adapter_filter_to_state_dict,
    )
    from megatron.bridge.training.utils.checkpoint_utils import read_run_config

    # NOTE(review): the default checkpoint path contains "dora" while the
    # adapters below are rebuilt with the LoRA class -- confirm the training
    # run used plain LoRA.
    lora_dir = Path(os.environ.get("LORA_CKPT", "/work/runs/warden-dora/checkpoints/iter_0000150"))
    hf_dir = os.environ.get("HF_DIR", "/models/nemotron-3-nano-30b-bf16")
    base_dir = os.environ.get("MEGATRON_CKPT", "/models/nemotron-3-nano-30b-megatron")
    out_dir = os.environ.get("OUT_DIR", "/models/nemotron-3-nano-30b-warden-hf")

    print(f"[merge-hf] base={base_dir} lora={lora_dir} -> {out_dir}", flush=True)
    bridge = AutoBridge.from_hf_pretrained(hf_dir, trust_remote_code=True)

    # Single-process layout: every parallel dimension is set to 1.
    provider = bridge.to_megatron_provider(load_weights=False)
    provider.tensor_model_parallel_size = 1
    provider.pipeline_model_parallel_size = 1
    provider.expert_model_parallel_size = 1
    provider.expert_tensor_parallel_size = 1
    provider.pipeline_dtype = torch.bfloat16
    provider.initialize_model_parallel(seed=0)

    model = bridge.load_megatron_model(base_dir, wrap_with_ddp=False)

    # Recreate the adapter structure from the training run_config, then load
    # only the adapter tensors from the finetune checkpoint.
    run_cfg = read_run_config(str(lora_dir / "run_config.yaml"))
    peft_cfg = run_cfg.get("peft", {}) or {}
    # Only these keys are forwarded to LoRA(); anything else in the saved
    # config is dropped.
    allowed = {"target_modules", "dim", "alpha", "dropout", "dropout_position"}
    peft_cfg = {k: v for k, v in peft_cfg.items() if k in allowed}
    print(f"[merge-hf] LoRA structure: {peft_cfg}", flush=True)
    lora_peft = LoRA(**peft_cfg)
    model = lora_peft(model, training=False)

    sharded_sd = _generate_model_state_dict(model, {})
    sharded_sd = apply_peft_adapter_filter_to_state_dict(sharded_sd, lora_peft)
    loaded = dist_checkpointing.load(sharded_sd, str(lora_dir))

    # Prefer the exact "model" entry, otherwise the first key starting with
    # "model". Fail with a readable error instead of a bare StopIteration
    # when neither exists.
    if "model" in loaded:
        key = "model"
    else:
        key = next((k for k in loaded if k.startswith("model")), None)
    if key is None:
        raise RuntimeError(
            f"no 'model*' entry in checkpoint loaded from {lora_dir}; keys: {list(loaded)}"
        )

    # strict=False silently ignores entries whose names do not exist in the
    # model, so the number of loaded entries alone does not prove anything was
    # applied. Subtract the unexpected keys to get what actually landed.
    load_result = model[0].load_state_dict(loaded[key], strict=False)
    n_unexpected = len(load_result.unexpected_keys)
    n_adapter = len(loaded[key]) - n_unexpected
    print(f"[merge-hf] loaded {n_adapter} adapter tensors (unexpected: {n_unexpected})", flush=True)
    if n_adapter <= 0:
        raise RuntimeError("no adapter tensors loaded — refusing to export an unmodified base model")

    # save_hf_pretrained merges LoRALinear wrappers into dense weights during
    # its streaming export; source_path preserves the custom Nemotron-H
    # modeling files so the result is from_pretrained-loadable.
    bridge.save_hf_pretrained(model, out_dir, source_path=hf_dir)
    print("[merge-hf] export complete", flush=True)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
# Script entry point: run the merge only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
|
finetune/nemo/run_merge_spark2.sh
CHANGED
|
@@ -30,7 +30,7 @@ docker run -d --name "$NAME" \
|
|
| 30 |
-v "$MODELS_DIR":/models \
|
| 31 |
-v "$WORK_DIR":/work \
|
| 32 |
--entrypoint torchrun nvcr.io/nvidia/nemo:25.11.nemotron_3_nano \
|
| 33 |
-
--nproc-per-node=1 --nnodes=1 /work/
|
| 34 |
|
| 35 |
echo "container started; watchdog polling every 5s (kill if MemAvailable < 12GB)"
|
| 36 |
while [ -n "$(docker ps -q -f name="$NAME")" ]; do
|
|
|
|
| 30 |
-v "$MODELS_DIR":/models \
|
| 31 |
-v "$WORK_DIR":/work \
|
| 32 |
--entrypoint torchrun nvcr.io/nvidia/nemo:25.11.nemotron_3_nano \
|
| 33 |
+
--nproc-per-node=1 --nnodes=1 /work/"${MERGE_SCRIPT:-run_merge_hf.py}" || exit 1
|
| 34 |
|
| 35 |
echo "container started; watchdog polling every 5s (kill if MemAvailable < 12GB)"
|
| 36 |
while [ -n "$(docker ps -q -f name="$NAME")" ]; do
|