Buckets:

Mercity/FluxDistill / scripts /run_nvfp4_cell.sh
Pranav2748's picture
download
raw
1.72 kB
#!/usr/bin/env bash
# Run ONE NVFP4 SVDQuant cell (build + eval) with its own logs. NVFP4 = Blackwell-native FP4
# (E2M1 elements, group-16, FP8 block scales) on the residual weights + low-rank branch in bf16.
# Recipe fixed to the converged form: plain SVD + refine, NO smoothing (per RESULTS.md).
#
# Usage: bash scripts/run_nvfp4_cell.sh <RANK> <mode>
# mode = w4a4 : nvfp4 weights (g16) + nvfp4 acts (g16) — the deployable Blackwell format
# mode = w4a8fp8 : nvfp4 weights (g16) + FP8(E4M3) acts — max-quality NVFP4-weight variant
# NOTE: `-e` is intentionally not enabled. Each stage's success is decided by a
# marker line in its log (see the grep checks below), not by exit statuses.
set -uo pipefail

# Print the usage line to stderr and exit with status 2 (bad invocation).
usage() {
  printf 'Usage: bash scripts/run_nvfp4_cell.sh <RANK> <mode>   (mode: w4a4 | w4a8fp8)\n' >&2
  exit 2
}

# Validate the command line before touching the filesystem or the GPU, so a
# typo costs nothing instead of a full build of the wrong configuration.
(( $# == 2 )) || usage
[[ -n "$1" ]] || usage
R="$1"; mode="$2"

# Map the mode onto the weight/activation formats, group sizes and act bits.
# Unknown modes are rejected rather than silently treated as w4a4.
case "$mode" in
  w4a8fp8)
    WFMT=nvfp4; AFMT=fp8; WG=16; AG=0; AB=8; tag="nvfp4_r${R}_w4nvfp4_a8fp8"
    ;;
  w4a4)
    WFMT=nvfp4; AFMT=nvfp4; WG=16; AG=16; AB=4; tag="nvfp4_r${R}_w4a4g16"
    ;;
  *)
    printf 'unknown mode: %s\n' "$mode" >&2
    usage
    ;;
esac

# All paths below are relative to /workspace; refuse to run anywhere else.
cd /workspace || { echo "!!! cannot cd to /workspace" >&2; exit 1; }
export PYTHONPATH=. HF_HOME=/workspace/.cache/huggingface PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

OUT="outputs/${tag}"; blog="tmp/${tag}.build.log"; elog="tmp/${tag}.eval.log"
# tee cannot create the logs if tmp/ is missing, which would make the marker
# checks below report a failure even after a successful run.
mkdir -p tmp || { echo "!!! cannot create log directory tmp/" >&2; exit 1; }

# --- Build stage: configuration is passed to the builder via environment. ---
echo ">>> [$(date +%H:%M:%S)] BUILD $tag (WFMT=$WFMT AFMT=$AFMT WG=$WG AG=$AG RANK=$R)"
RANK="$R" ALPHA=0.5 WBITS=4 ABITS="$AB" WGROUP="$WG" AGROUP="$AG" WFMT="$WFMT" AFMT="$AFMT" \
N_CALIB=300 WHITEN=0 REFINE=3 SMOOTH=0 CALIB_DIR=data/monet_cache MB=4 OUT="$OUT" \
python3 -u scripts/12_build_svdquant.py 2>&1 | tee "$blog"
# The build counts as successful only if its log contains the "DONE ->" marker.
grep -q "DONE ->" "$blog" || { echo "!!! BUILD FAILED $tag"; tail -8 "$blog"; exit 1; }

# --- Eval stage: runs against the directory the build wrote to. ---
echo ">>> [$(date +%H:%M:%S)] EVAL $tag"
python3 -u scripts/13_eval_svdquant.py "$OUT" 2>&1 | tee "$elog"
# The eval counts as successful only if its log reports saved montages.
grep -q "saved .* montages" "$elog" || { echo "!!! EVAL FAILED $tag"; tail -8 "$elog"; exit 1; }

# --- Summary: last eval_vel_loss line plus selected build-log statistics. ---
echo "=== CELL DONE $tag :: $(grep eval_vel_loss= "$elog" | tail -1) ::"
# The summary lines are informational; under pipefail a grep with no matches
# would otherwise make a successful cell exit non-zero.
grep -E "rel-err: mean|x smaller" "$blog" | sed 's/^/ /' || true

Xet Storage Details

Size:
1.72 kB
·
Xet hash:
5da8e1be564f2bef1c4a77aff478bc506784924868b5fefd497e3ce3898f6e7d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.