File size: 2,400 Bytes
1d0c0cc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
{
  "run_name": "sfp4_v4_sparse09_hpo_on_ours_p_init2050_1n_interactive",
  "checkpoint": "checkpoint-700",
  "training_method": "legacy_sft_wan_training_pipeline",
  "model_path": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
  "init_weights_from_safetensors": "checkpoints/init/sfp4_v4_sparse06_hpo_on_ours_p_1n_interactive_v2_ckpt2050/transformer/diffusion_pytorch_model.safetensors",
  "environment": {
    "FASTVIDEO_ATTENTION_BACKEND": "SPARSE_FP4_OURS_P_ATTN",
    "FASTVIDEO_SPARSE_FP4_USE_HIGH_PREC_O": "1",
    "FASTVIDEO_VALIDATION_ONE_PROMPT_PER_RANK": "1",
    "WANDB_MODE": "online",
    "WANDB_RESUME": "allow"
  },
  "vsa_schedule": {
    "VSA_SPARSITY": 0.9,
    "VSA_INIT_SPARSITY": 0.9,
    "VSA_WARMUP_STEPS": 0,
    "VSA_DECAY_RATE": 0.03,
    "VSA_DECAY_INTERVAL_STEPS": 50,
    "effective_sparsity_from_step_0": 0.9
  },
  "attention_semantics": {
    "selected_backend": "SPARSE_FP4_OURS_P_ATTN",
    "self_attention": {
      "backend_path": "fastvideo/attention/backends/sparse_fp4_ours_p_attn.py",
      "kernel_path": "fastvideo-kernel/python/fastvideo_kernel/triton_kernels/block_sparse_attn_triton_ours_p.py",
      "tile_size_video": [4, 4, 4],
      "tile_tokens": 64,
      "qkv_quantization": "FP4 fake quantization with STE, no q/k mean subtraction in quantization",
      "block_selection": "top-k blocks from q_c @ k_c tile-mean scores",
      "p_quantization": "group-local exp2(qk - group_max) FP4 fake quantization; compensation multiplies exp2(group_max - running_row_m)",
      "dropped_tile_handling": "tile-level q_mean/k_mean score and mean_v compensation"
    },
    "cross_attention": {
      "backend": "dense_sdpa",
      "reason": "sparse_fp4_ours_p_attn.py treats query_length != key_length as cross attention and returns _dense_sdpa_blhd",
      "quantized": false,
      "sparse": false
    },
    "force_dense": {
      "backend": "dense_sdpa",
      "used_for": "teacher or explicitly forced dense paths, not the normal SFT student self-attention path"
    }
  },
  "validation_and_checkpointing": {
    "save_steps": 50,
    "eval_steps": 50,
    "validation_sampling_steps": 50,
    "validation_guidance_scale": 5.0,
    "checkpoints_total_limit": 5,
    "flow_shift": 1.0
  },
  "training_shape": {
    "num_latent_t": 20,
    "num_frames": 77,
    "height": 448,
    "width": 832,
    "batch_size_per_gpu": 1,
    "sp_size": 1,
    "tp_size": 1
  }
}