Checkpoint is broken because tid2eid is all-zeros

#1
by gentry1337 - opened
import json

import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import safe_open


# Hub repo ids handed to hf_hub_download for the two checkpoints under comparison.
REDHAT = "RedHatAI/DeepSeek-V4-Flash-BF16"
ORIGINAL = "deepseek-ai/DeepSeek-V4-Flash"
# Tensor names to inspect: the ffn.gate.tid2eid entry of layers 0, 1 and 2.
KEYS = list(map("layers.{}.ffn.gate.tid2eid".format, range(3)))


def load_tid2eid(repo: str, key: str) -> torch.Tensor:
    """Load one tensor from a sharded safetensors repo and print a summary.

    The shard holding ``key`` is looked up in the repo's
    ``model.safetensors.index.json``; that shard is then downloaded and the
    tensor is read on CPU. The shard name, shape, dtype, min/max, zero
    statistics and the first three rows are printed to stdout.

    Args:
        repo: Repo id passed to ``hf_hub_download``.
        key: Tensor name to look up in the index's ``weight_map``.

    Returns:
        The tensor read from the shard.

    Raises:
        KeyError: If the index has no ``weight_map`` entry for ``key``.
    """
    index_path = hf_hub_download(repo, "model.safetensors.index.json")
    # JSON is UTF-8 encoded; do not rely on the platform's locale default.
    with open(index_path, encoding="utf-8") as index_file:
        shard = json.load(index_file)["weight_map"][key]

    shard_path = hf_hub_download(repo, shard)
    with safe_open(shard_path, framework="pt", device="cpu") as shard_file:
        tensor = shard_file.get_tensor(key)

    print(f"{repo} {key}")
    print(f"  shard: {shard}")
    print(f"  shape: {tuple(tensor.shape)} dtype: {tensor.dtype}")
    print(f"  min/max: {tensor.min().item()}/{tensor.max().item()}")
    print(f"  all_zero: {bool(torch.all(tensor == 0).item())}")
    print(f"  nonzero: {torch.count_nonzero(tensor).item()}/{tensor.numel()}")
    print(f"  sample: {tensor[:3].tolist()}")
    return tensor


# For every tensor name, load it from both repos and report how they differ.
for key in KEYS:
    redhat = load_tid2eid(REDHAT, key)
    original = load_tid2eid(ORIGINAL, key)
    print(f"compare {key}")
    print(f"  equal: {torch.equal(redhat, original)}")
    if redhat.shape == original.shape:
        print(f"  differing_entries: {(redhat != original).sum().item()}/{redhat.numel()}")
    else:
        # An element-wise `!=` would either raise or silently broadcast here,
        # so report the shape mismatch instead of a misleading count.
        print(
            "  differing_entries: n/a (shape mismatch: "
            f"{tuple(redhat.shape)} vs {tuple(original.shape)})"
        )
    print()

Output:

RedHatAI/DeepSeek-V4-Flash-BF16 layers.0.ffn.gate.tid2eid
  shard: model-00001-of-00013.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/0
  all_zero: True
  nonzero: 0/775680
  sample: [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
model-00002-of-00046.safetensors: 100%|███████████████████████████████████████████████████████████████████| 3.57G/3.57G [00:12<00:00, 297MB/s]
deepseek-ai/DeepSeek-V4-Flash layers.0.ffn.gate.tid2eid
  shard: model-00002-of-00046.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/255
  all_zero: False
  nonzero: 772408/775680
  sample: [[254, 222, 245, 200, 53, 35], [239, 202, 122, 23, 115, 57], [38, 108, 37, 228, 45, 96]]
compare layers.0.ffn.gate.tid2eid
  equal: False
  differing_entries: 772408/775680

RedHatAI/DeepSeek-V4-Flash-BF16 layers.1.ffn.gate.tid2eid
  shard: model-00001-of-00013.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/0
  all_zero: True
  nonzero: 0/775680
  sample: [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
model-00003-of-00046.safetensors: 100%|███████████████████████████████████████████████████████████████████| 3.57G/3.57G [00:11<00:00, 313MB/s]
deepseek-ai/DeepSeek-V4-Flash layers.1.ffn.gate.tid2eid
  shard: model-00003-of-00046.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/255
  all_zero: False
  nonzero: 772391/775680
  sample: [[163, 137, 158, 97, 184, 8], [63, 126, 219, 21, 170, 150], [46, 73, 237, 13, 16, 17]]
compare layers.1.ffn.gate.tid2eid
  equal: False
  differing_entries: 772391/775680

RedHatAI/DeepSeek-V4-Flash-BF16 layers.2.ffn.gate.tid2eid
  shard: model-00001-of-00013.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/0
  all_zero: True
  nonzero: 0/775680
  sample: [[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
model-00004-of-00046.safetensors: 100%|███████████████████████████████████████████████████████████████████| 3.60G/3.60G [00:12<00:00, 299MB/s]
deepseek-ai/DeepSeek-V4-Flash layers.2.ffn.gate.tid2eid
  shard: model-00004-of-00046.safetensors
  shape: (129280, 6) dtype: torch.int64
  min/max: 0/255
  all_zero: False
  nonzero: 772362/775680
  sample: [[108, 115, 43, 53, 132, 143], [217, 221, 240, 26, 247, 39], [87, 185, 66, 216, 200, 153]]
compare layers.2.ffn.gate.tid2eid
  equal: False
  differing_entries: 772362/775680

Sign up or log in to comment