| """估算 E2EAVModel 在 BS≥8 训练时的显存/内存需求。 |
| |
| 输出 |
| - 各模块参数数量 |
| - 训练显存细分:参数 / 优化器 / 梯度 / 主激活 / 多任务梯度副本 / 缓冲 |
| - 推荐设备(HF Sandbox / Jobs) |
| - 主机内存与磁盘开销 |
| |
| 公式说明(粗略上界) |
| - 参数 (bf16): 2 B/p;fp32 主副本: 4 B/p |
| - AdamW 一阶/二阶矩 (fp32): 8 B/p |
| - 梯度 (fp32): 4 B/p |
| - bf16 训练总计:参数 2 + 主 4 + AdamW 8 + grad 4 = 18 B/可训练 p |
| - DINOv3 冻结 Stage1:仅 2 B/p(前向激活按 no_grad 释放,可忽略) |
| - 主激活:每层约 ``B * N * D * 2 B``(bf16),18 层;MoE 层另加 8 个专家 |
| SwiGLU 中间 ``B * N * 2 * 4D * 2 B`` 的临时项,但 Dense 加权求和后只 |
| 需 1 份输出。实际显存按"激活 = 单层峰值 × 层数"近似。 |
| - PCGrad 在共享参数上 N 次 ``autograd.grad``:需要 retain_graph, |
| 每个任务额外保留中间激活的引用,最坏放大 N 倍。这里按 1.5x 估算 |
| (GPU autograd 内部 reuse + checkpointing 后通常远低于 N 倍)。 |
| """ |

from __future__ import annotations

import sys
from dataclasses import dataclass
from pathlib import Path

# Make the in-repo src/ layout importable without installing the package.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "src"))

from wjad.model import E2EAVModel  # noqa: E402  (needs the sys.path insert above)


@dataclass
class MemoryReport:
    bs: int
    seq_len: int
    dim: int
    layers: int
    params_total: int
    params_trainable_stage1: int
    params_trainable_stage2: int
    weights_gb_stage1: float
    weights_gb_stage2: float
    optim_gb_stage1: float
    optim_gb_stage2: float
    activations_gb: float
    pcgrad_overhead_gb: float
    total_stage1_gb: float
    total_stage2_gb: float
    host_ram_gb: float
    disk_gb: float
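

# Per-parameter byte budget from the module docstring, kept here as a small
# sanity check of the "18 B per trainable param" figure used in estimate():
#   bf16 weight 2 + fp32 master 4 + AdamW m,v 8 + fp32 grad 4 = 18.
BYTES_PER_TRAINABLE_PARAM = 2 + 4 + 8 + 4
BYTES_PER_FROZEN_PARAM = 2  # bf16 weights only (frozen DINOv3 in Stage 1)
assert BYTES_PER_TRAINABLE_PARAM == 18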


def count_params(model) -> tuple[int, dict[str, int]]:
    """Count total parameters plus a per-top-level-submodule breakdown."""
    total = 0
    by_module: dict[str, int] = {}
    for name, child in model.named_children():
        n = sum(p.numel() for p in child.parameters())
        by_module[name] = n
        total += n
    return total, by_module


def estimate(bs: int = 8) -> MemoryReport:
    model = E2EAVModel(
        dinov3_path=str(ROOT / "dinov3-vitb16-pretrain-lvd1689m"),
        backbone_dim=768,
        num_heads=12,
        num_dense_layers=9,
        num_moe_layers=9,
        num_routed_experts=7,
        num_shared_experts=1,
        topk_experts=3,
        ffn_mult=4,
        num_history_frames=8,
        num_detection_tokens=1024,
        num_control_tokens=24,
        num_ego_tokens=8,
        num_extra_tokens=256,
        image_h=384,
        image_w=1024,
        patch_size=16,
        num_classes=22,
        traj_horizon=24,
        freeze_dinov3=True,
    )

    total, by_module = count_params(model)
    dinov3_n = by_module.get("dinov3", 0)
    trainable_stage1 = total - dinov3_n  # Stage 1: DINOv3 frozen
    trainable_stage2 = total  # Stage 2: everything trains

    # Visual tokens: 8 history frames over a 24 x 64 patch grid
    # (384/16 x 1024/16), each dimension downsampled by 2.
    n_visual = (8 // 2) * (24 // 2) * (64 // 2)
    # Plus ego (8), detection (1024), control (24) and extra (256) tokens.
    seq_len = n_visual + 8 + 1024 + 24 + 256
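    # Worked out: n_visual = 4 * 12 * 32 = 1536, so
    # seq_len = 1536 + 8 + 1024 + 24 + 256 = 2848 tokens.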

    GB = 1024 ** 3
    # Weights in bf16: 2 B/param for both the frozen and trainable parts.
    weights_stage1 = (dinov3_n * 2 + trainable_stage1 * 2) / GB
    weights_stage2 = (total * 2) / GB
    # Optimizer-side state per trainable param: fp32 master 4 + AdamW m,v 8
    # + fp32 grad 4 = 16 B (with the 2 B bf16 weight above this matches the
    # docstring's 18 B/param).
    optim_stage1 = (trainable_stage1 * (4 + 8 + 4)) / GB
    optim_stage2 = (trainable_stage2 * (4 + 8 + 4)) / GB

    # Main activations: B * N * D * 2 B (bf16) per layer; 18 transformer
    # layers plus roughly 6 layer-equivalents for embeddings/heads, and
    # x1.5 for attention/LayerNorm temporaries.
    base_act = bs * seq_len * 768 * 2 * (18 + 6) * 1.5
    # MoE SwiGLU intermediates: width 4D = 3072, bf16 (2 B), 8 experts,
    # 9 MoE layers -- an upper bound, since only topk + shared experts fire.
    moe_act = bs * seq_len * 3072 * 2 * 8 * 9
    # Frozen DINOv3 forward under no_grad: flat ~2 GB allowance.
    dino_act = 2.0 * GB
    activations_gb = (base_act + moe_act + dino_act) / GB

    # PCGrad keeps per-task references to intermediate activations; per the
    # docstring we budget 1.5x total activations, i.e. an extra 0.5x here.
    pcgrad_overhead_gb = 0.5 * activations_gb

    # Totals include a flat +2 GB for CUDA context, allocator fragmentation
    # and miscellaneous buffers.
    total_stage1 = weights_stage1 + optim_stage1 + activations_gb + pcgrad_overhead_gb + 2.0
    total_stage2 = weights_stage2 + optim_stage2 + activations_gb + pcgrad_overhead_gb + 2.0

    # Host RAM: ~8 GB baseline + dataloader buffers
    # (roughly bs x 0.3 GB/sample x 4 workers x prefetch factor 2).
    host_ram = 8.0 + bs * 0.3 * 4 * 2

    # Disk: checkpoints plus dataset cache, typical ballpark.
    disk = 50.0

    return MemoryReport(
        bs=bs,
        seq_len=seq_len,
        dim=768,
        layers=18,
        params_total=total,
        params_trainable_stage1=trainable_stage1,
        params_trainable_stage2=trainable_stage2,
        weights_gb_stage1=weights_stage1,
        weights_gb_stage2=weights_stage2,
        optim_gb_stage1=optim_stage1,
        optim_gb_stage2=optim_stage2,
        activations_gb=activations_gb,
        pcgrad_overhead_gb=pcgrad_overhead_gb,
        total_stage1_gb=total_stage1,
        total_stage2_gb=total_stage2,
        host_ram_gb=host_ram,
        disk_gb=disk,
    )
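

# Why PCGrad inflates activation memory -- a minimal sketch, not the project's
# actual trainer (``task_losses`` and ``shared_params`` are placeholders):
# every ``autograd.grad`` call except the last needs retain_graph=True, so the
# shared graph's intermediate activations stay alive until all per-task
# gradients have been materialized.
def _pcgrad_per_task_grads(task_losses, shared_params):
    from torch import autograd  # local import; only this sketch needs it

    grads = []
    for i, loss in enumerate(task_losses):
        retain = i < len(task_losses) - 1  # free the graph on the final task
        grads.append(autograd.grad(loss, shared_params, retain_graph=retain))
    return grads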


def recommend_device(stage_max_gb: float) -> tuple[str, str]:
    """Recommend a GPU based on the Stage 2 peak-VRAM estimate."""
    margin = 1.15
    need = stage_max_gb * margin
    # Sorted by capacity so the first fit is the smallest adequate card.
    candidates = [
        ("T4 16GB", 16),
        ("L4 24GB", 24),
        ("A10G 24GB", 24),
        ("A100 40GB", 40),
        ("A10G Large 48GB", 48),
        ("L40S 48GB", 48),
        ("A100 80GB", 80),
        ("H100 80GB", 80),
    ]
    fit = [c for c in candidates if c[1] >= need]
    if not fit:
        return "H200 / multi-GPU 80GB+", f"needs ≥{need:.1f} GB (beyond a single card)"
    return fit[0][0], f"needs ≥{need:.1f} GB"
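

# Small self-checks of the fit logic above: 26 GB x 1.15 margin ≈ 29.9 GB
# lands on the A100 40GB; 10 GB x 1.15 still fits a T4.
assert recommend_device(26.0)[0] == "A100 40GB"
assert recommend_device(10.0)[0] == "T4 16GB"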


def main() -> None:
    print("=" * 72)
    print(" WJAD training VRAM / memory estimate (bf16 AMP)")
    print("=" * 72)
    for bs in (1, 2, 4, 8, 16):
        r = estimate(bs)
        print(f"\n--- BS = {bs} ---")
        print(f"  total params     : {r.params_total / 1e6:8.2f} M")
        print(f"  trainable (S1)   : {r.params_trainable_stage1 / 1e6:8.2f} M")
        print(f"  trainable (S2)   : {r.params_trainable_stage2 / 1e6:8.2f} M")
        print(f"  sequence length  : {r.seq_len}")
        print(f"  weights (S1/S2)  : {r.weights_gb_stage1:6.2f} / {r.weights_gb_stage2:6.2f} GB")
        print(f"  optimizer (S1/S2): {r.optim_gb_stage1:6.2f} / {r.optim_gb_stage2:6.2f} GB")
        print(f"  activations      : {r.activations_gb:6.2f} GB")
        print(f"  PCGrad headroom  : {r.pcgrad_overhead_gb:6.2f} GB")
        print(f"  VRAM total S1    : {r.total_stage1_gb:6.2f} GB")
        print(f"  VRAM total S2    : {r.total_stage2_gb:6.2f} GB  <- peak")
        gpu, note = recommend_device(r.total_stage2_gb)
        print(f"  recommended GPU  : {gpu} ({note})")
        print(f"  host RAM         : ≥ {r.host_ram_gb:6.2f} GB")
        print(f"  disk (typical)   : ≈ {r.disk_gb:6.0f} GB")

    print()
    print("Notes:")
    print("  - Covers bf16 AMP + AdamW (m, v in fp32) + fp32 master copy and gradients + PCGrad overhead.")
    print("  - ``gradient_checkpointing`` cuts activations to roughly 1/3, letting BS grow severalfold.")
    print("  - Calibrate against reality with ``nvidia-smi`` or ``torch.cuda.max_memory_allocated()``.")
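

# Calibration helper -- a minimal sketch, assuming a CUDA device is present:
# run one real forward/backward/optimizer step elsewhere, then read the
# allocator's high-water mark and compare it with the estimates above.
def measure_peak_vram_gb() -> float:
    import torch  # local import; the estimator itself runs without it

    return torch.cuda.max_memory_allocated() / 1024 ** 3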


if __name__ == "__main__":
    main()