IMJONEZZ committed on
Commit
7bd2c00
·
1 Parent(s): 11143b6

finetune: merge LoRA via streaming HF export (avoids 2x-model save peak)

Browse files
finetune/nemo/run_merge_hf.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Merge the Warden LoRA into the base and export DIRECTLY to HF safetensors.
3
+
4
+ Why not the stock merge_lora.py: its save_megatron_model stages a second
5
+ ~63GB copy of the model during the torch_dist save — on a 121GB unified-memory
6
+ GB10 that's what tripped the watchdog (and previously froze spark-eee9).
7
+ AutoBridge.save_hf_pretrained instead streams tensor-by-tensor into sharded
8
+ safetensors (peak ≈ model + one shard) and auto-merges LoRA adapters on the
9
+ way out. It also skips the merged-Megatron intermediate entirely: HF format
10
+ is what the GGUF conversion and the ZeroGPU Space load anyway.
11
+
12
+ Load + adapter-patch steps mirror /opt/Megatron-Bridge/examples/peft/merge_lora.py.
13
+ """
14
+ import multiprocessing as mp
15
+
16
+ mp.set_start_method("forkserver", force=True)
17
+
18
+ import os # noqa: E402
19
+ import sys # noqa: E402
20
+
21
+
22
def main():
    """Merge the fine-tuned adapter into the base model and export it as HF safetensors.

    Paths are read from environment variables, each with a built-in default:

    * ``LORA_CKPT``: finetune checkpoint directory holding ``run_config.yaml``
      and the adapter tensors.
    * ``HF_DIR``: Hugging Face model directory used to build the bridge and
      passed as ``source_path`` to the export.
    * ``MEGATRON_CKPT``: Megatron checkpoint of the base model.
    * ``OUT_DIR``: destination directory of the exported model.

    Raises:
        FileNotFoundError: if ``run_config.yaml`` is missing from the adapter
            checkpoint directory (checked before any model is loaded).
        RuntimeError: if the loaded checkpoint has no ``model*`` entry, or if
            no adapter tensor ended up applied to the model.
    """
    # Heavy imports stay inside main(), as in the original layout.
    from pathlib import Path

    import torch
    from megatron.core import dist_checkpointing
    from megatron.bridge.models.conversion.auto_bridge import AutoBridge
    from megatron.bridge.peft.lora import LoRA
    from megatron.bridge.training.checkpointing import (
        _generate_model_state_dict,
        apply_peft_adapter_filter_to_state_dict,
    )
    from megatron.bridge.training.utils.checkpoint_utils import read_run_config

    lora_dir = Path(os.environ.get("LORA_CKPT", "/work/runs/warden-dora/checkpoints/iter_0000150"))
    hf_dir = os.environ.get("HF_DIR", "/models/nemotron-3-nano-30b-bf16")
    base_dir = os.environ.get("MEGATRON_CKPT", "/models/nemotron-3-nano-30b-megatron")
    out_dir = os.environ.get("OUT_DIR", "/models/nemotron-3-nano-30b-warden-hf")

    # Fail fast: the run config is only read after the base model is loaded,
    # so a wrong LORA_CKPT would otherwise surface late.
    run_config_path = lora_dir / "run_config.yaml"
    if not run_config_path.is_file():
        raise FileNotFoundError(f"adapter run config not found: {run_config_path}")

    print(f"[merge-hf] base={base_dir} lora={lora_dir} -> {out_dir}", flush=True)
    bridge = AutoBridge.from_hf_pretrained(hf_dir, trust_remote_code=True)

    # Single-process layout: every parallelism dimension is set to 1.
    provider = bridge.to_megatron_provider(load_weights=False)
    provider.tensor_model_parallel_size = 1
    provider.pipeline_model_parallel_size = 1
    provider.expert_model_parallel_size = 1
    provider.expert_tensor_parallel_size = 1
    provider.pipeline_dtype = torch.bfloat16
    provider.initialize_model_parallel(seed=0)

    model = bridge.load_megatron_model(base_dir, wrap_with_ddp=False)

    # Recreate the adapter structure from the training run_config, then load
    # only the adapter tensors from the finetune checkpoint.
    # NOTE(review): the default LORA_CKPT path mentions "dora" while the
    # adapter is rebuilt with the LoRA class - confirm the run's peft type.
    run_cfg = read_run_config(str(run_config_path))
    peft_cfg = run_cfg.get("peft", {}) or {}
    allowed = {"target_modules", "dim", "alpha", "dropout", "dropout_position"}
    peft_cfg = {k: v for k, v in peft_cfg.items() if k in allowed}
    print(f"[merge-hf] LoRA structure: {peft_cfg}", flush=True)
    lora_peft = LoRA(**peft_cfg)
    model = lora_peft(model, training=False)

    sharded_sd = _generate_model_state_dict(model, {})
    sharded_sd = apply_peft_adapter_filter_to_state_dict(sharded_sd, lora_peft)
    loaded = dist_checkpointing.load(sharded_sd, str(lora_dir))

    # Prefer the plain "model" entry; otherwise take the first "model*" key.
    # A default of None avoids leaking a bare StopIteration.
    key = "model" if "model" in loaded else next((k for k in loaded if k.startswith("model")), None)
    if key is None:
        raise RuntimeError(f"no 'model*' entry in loaded checkpoint (keys: {sorted(loaded)})")

    load_result = model[0].load_state_dict(loaded[key], strict=False)
    n_adapter = len(loaded[key])
    n_unexpected = len(load_result.unexpected_keys)
    print(f"[merge-hf] loaded {n_adapter} adapter tensors (unexpected: {n_unexpected})", flush=True)
    if n_adapter == 0:
        raise RuntimeError("no adapter tensors loaded — refusing to export an unmodified base model")
    # Unexpected keys were read from the checkpoint but did not bind to any
    # model parameter. If none bound, the model is still the plain base.
    if n_adapter - n_unexpected <= 0:
        raise RuntimeError(
            f"none of the {n_adapter} loaded adapter tensors matched the model "
            "— refusing to export an unmodified base model"
        )

    # save_hf_pretrained merges LoRALinear wrappers into dense weights during
    # its streaming export; source_path preserves the custom Nemotron-H
    # modeling files so the result is from_pretrained-loadable.
    bridge.save_hf_pretrained(model, out_dir, source_path=hf_dir)
    print("[merge-hf] export complete", flush=True)
79
+
80
+ if __name__ == "__main__":
81
+ main()
finetune/nemo/run_merge_spark2.sh CHANGED
@@ -30,7 +30,7 @@ docker run -d --name "$NAME" \
30
  -v "$MODELS_DIR":/models \
31
  -v "$WORK_DIR":/work \
32
  --entrypoint torchrun nvcr.io/nvidia/nemo:25.11.nemotron_3_nano \
33
- --nproc-per-node=1 --nnodes=1 /work/run_merge.py || exit 1
34
 
35
  echo "container started; watchdog polling every 5s (kill if MemAvailable < 12GB)"
36
  while [ -n "$(docker ps -q -f name="$NAME")" ]; do
 
30
  -v "$MODELS_DIR":/models \
31
  -v "$WORK_DIR":/work \
32
  --entrypoint torchrun nvcr.io/nvidia/nemo:25.11.nemotron_3_nano \
33
+ --nproc-per-node=1 --nnodes=1 /work/"${MERGE_SCRIPT:-run_merge_hf.py}" || exit 1
34
 
35
  echo "container started; watchdog polling every 5s (kill if MemAvailable < 12GB)"
36
  while [ -n "$(docker ps -q -f name="$NAME")" ]; do