"""估算 E2EAVModel 在 BS≥8 训练时的显存/内存需求。 输出 - 各模块参数数量 - 训练显存细分:参数 / 优化器 / 梯度 / 主激活 / 多任务梯度副本 / 缓冲 - 推荐设备(HF Sandbox / Jobs) - 主机内存与磁盘开销 公式说明(粗略上界) - 参数 (bf16): 2 B/p;fp32 主副本: 4 B/p - AdamW 一阶/二阶矩 (fp32): 8 B/p - 梯度 (fp32): 4 B/p - bf16 训练总计:参数 2 + 主 4 + AdamW 8 + grad 4 = 18 B/可训练 p - DINOv3 冻结 Stage1:仅 2 B/p(前向激活按 no_grad 释放,可忽略) - 主激活:每层约 ``B * N * D * 2 B``(bf16),18 层;MoE 层另加 8 个专家 SwiGLU 中间 ``B * N * 2 * 4D * 2 B`` 的临时项,但 Dense 加权求和后只 需 1 份输出。实际显存按"激活 = 单层峰值 × 层数"近似。 - PCGrad 在共享参数上 N 次 ``autograd.grad``:需要 retain_graph, 每个任务额外保留中间激活的引用,最坏放大 N 倍。这里按 1.5x 估算 (GPU autograd 内部 reuse + checkpointing 后通常远低于 N 倍)。 """ from __future__ import annotations import sys from pathlib import Path ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT / "src")) from dataclasses import dataclass from wjad.model import E2EAVModel @dataclass class MemoryReport: bs: int seq_len: int dim: int layers: int params_total: int params_trainable_stage1: int params_trainable_stage2: int weights_gb_stage1: float weights_gb_stage2: float optim_gb_stage1: float optim_gb_stage2: float activations_gb: float pcgrad_overhead_gb: float total_stage1_gb: float total_stage2_gb: float host_ram_gb: float disk_gb: float def count_params(model) -> tuple[int, dict[str, int]]: total = 0 by_module: dict[str, int] = {} for name, child in model.named_children(): n = sum(p.numel() for p in child.parameters()) by_module[name] = n total += n return total, by_module def estimate(bs: int = 8) -> MemoryReport: model = E2EAVModel( dinov3_path=str(ROOT / "dinov3-vitb16-pretrain-lvd1689m"), # 完整规模 backbone_dim=768, num_heads=12, num_dense_layers=9, num_moe_layers=9, num_routed_experts=7, num_shared_experts=1, topk_experts=3, ffn_mult=4, num_history_frames=8, num_detection_tokens=1024, num_control_tokens=24, num_ego_tokens=8, num_extra_tokens=256, image_h=384, image_w=1024, patch_size=16, num_classes=22, traj_horizon=24, freeze_dinov3=True, ) total, by_module = count_params(model) dinov3_n = by_module.get("dinov3", 0) trainable_stage1 = total - dinov3_n trainable_stage2 = total # 序列长度(拼接后总 token 数 + 上下文) n_visual = (8 // 2) * (24 // 2) * (64 // 2) seq_len = n_visual + 8 + 1024 + 24 + 256 # === 显存 === # 单位:GB(除以 1024**3) GB = 1024 ** 3 weights_stage1 = (dinov3_n * 2 + trainable_stage1 * 2) / GB # 全部 bf16 weights_stage2 = (total * 2) / GB optim_stage1 = (trainable_stage1 * (4 + 4 + 4)) / GB # master + m + v optim_stage2 = (trainable_stage2 * (4 + 4 + 4)) / GB # 激活:粗略 = bs * seq_len * dim * 2 * (num_layers + 1) * 1.5 (含 attn/FFN 重叠) base_act = bs * seq_len * 768 * 2 * (18 + 6) * 1.5 # 主干 18 + 校准 6 # MoE FFN 中间 (4D = 3072) 的临时项:每 MoE 层 ≈ bs * seq_len * 3072 * 2 * 8(8 专家) moe_act = bs * seq_len * 3072 * 2 * 8 * 9 # DINOv3 冻结:no_grad,前向激活在 forward 后立即释放,估 2 GB 峰值 dino_act = 2.0 * GB activations_gb = (base_act + moe_act + dino_act) / GB # PCGrad 开销(共享参数上 N 次 autograd.grad):retain_graph 阶段会 # 阻止激活释放,最坏接近 1.5x;这里按 +0.5x 估算 pcgrad_overhead_gb = 0.5 * activations_gb total_stage1 = weights_stage1 + optim_stage1 + activations_gb + pcgrad_overhead_gb + 2.0 total_stage2 = weights_stage2 + optim_stage2 + activations_gb + pcgrad_overhead_gb + 2.0 # === 主机 RAM === # DataLoader prefetch + workers + 模型 CPU 副本 + JSON / LIDAR 解析 host_ram = 8.0 + bs * 0.3 * 4 * 2 # 4 workers, prefetch 2 # === 磁盘 === # 全量数据集 ~3TB;只跑 sandbox 时 ~5GB(几个 clip);典型 ~50GB(一个 weather 全部) disk = 50.0 return MemoryReport( bs=bs, seq_len=seq_len, dim=768, layers=18, params_total=total, params_trainable_stage1=trainable_stage1, params_trainable_stage2=trainable_stage2, 


def recommend_device(stage_max_gb: float) -> tuple[str, str]:
    """Recommend a GPU based on the Stage 2 peak VRAM."""
    margin = 1.15  # 15% headroom (fragmentation, CUDA caching, cuBLAS workspaces)
    need = stage_max_gb * margin
    # Ordered roughly from cheapest up; the first candidate that fits wins.
    candidates = [
        ("T4 16GB", 16),
        ("L4 24GB", 24),
        ("A10G 24GB", 24),
        ("A10G Large 48GB", 48),
        ("A100 40GB", 40),
        ("L40S 48GB", 48),
        ("A100 80GB", 80),
        ("H100 80GB", 80),
    ]
    fit = [c for c in candidates if c[1] >= need]
    if not fit:
        return "H200 / multi-GPU 80GB+", f"needs ≥{need:.1f} GB (beyond a single card)"
    return fit[0][0], f"needs ≥{need:.1f} GB"


def main() -> None:
    print("=" * 72)
    print(" WJAD training VRAM / memory estimate (bf16 AMP)")
    print("=" * 72)
    for bs in (1, 2, 4, 8, 16):
        r = estimate(bs)
        print(f"\n--- BS = {bs} ---")
        print(f"  total params      : {r.params_total / 1e6:8.2f} M")
        print(f"  trainable (S1)    : {r.params_trainable_stage1 / 1e6:8.2f} M")
        print(f"  trainable (S2)    : {r.params_trainable_stage2 / 1e6:8.2f} M")
        print(f"  sequence length   : {r.seq_len}")
        print(f"  weights (S1/S2)   : {r.weights_gb_stage1:6.2f} / {r.weights_gb_stage2:6.2f} GB")
        print(f"  optimizer (S1/S2) : {r.optim_gb_stage1:6.2f} / {r.optim_gb_stage2:6.2f} GB")
        print(f"  activations       : {r.activations_gb:6.2f} GB")
        print(f"  PCGrad margin     : {r.pcgrad_overhead_gb:6.2f} GB")
        print(f"  VRAM total S1     : {r.total_stage1_gb:6.2f} GB")
        print(f"  VRAM total S2     : {r.total_stage2_gb:6.2f} GB  <- peak")
        gpu, note = recommend_device(r.total_stage2_gb)
        print(f"  recommended GPU   : {gpu} ({note})")
        print(f"  host RAM          : ≥ {r.host_ram_gb:6.2f} GB")
        print(f"  disk (typical)    : ≈ {r.disk_gb:6.0f} GB")
    print()
    print("Notes:")
    print("  - Includes bf16 AMP + AdamW (fp32 m, v) + fp32 master copy & gradients + PCGrad overhead.")
    print("  - Enabling ``gradient_checkpointing`` cuts activations to about 1/3, letting BS scale up accordingly.")
    print("  - Calibrate against real runs with ``nvidia-smi`` or ``torch.cuda.max_memory_allocated()``.")


if __name__ == "__main__":
    main()
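

# Calibration sketch (assumptions: a CUDA device plus a real ``model`` and
# ``batch``; ``compute_loss`` is a hypothetical stand-in for the training
# loop's loss computation, not a function in this repo):
#
#     import torch
#     torch.cuda.reset_peak_memory_stats()
#     loss = compute_loss(model, batch)
#     loss.backward()
#     peak_gb = torch.cuda.max_memory_allocated() / 1024 ** 3
#
# Compare ``peak_gb`` with ``total_stage2_gb`` above to tune the 1.5x
# activation factor and the PCGrad margin.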