TeddyBearKinova / bundle /code /openpi /scripts /archive_teddybear_checkpoints_to_hf.sh
lsnu's picture
Upload folder using huggingface_hub
d93804e verified
#!/usr/bin/env bash
#
# Watches a checkpoint directory and archives finished checkpoints to a
# Hugging Face repository, polling on a fixed interval.
#
# Usage:
#   <script> <repo_id> <checkpoint_root> <status_log> <state_file> [interval_s]
#
#   repo_id          repository handed to the `hf` CLI
#   checkpoint_root  directory scanned for checkpoint sub-directories
#   status_log       file that timestamped status lines are appended to
#   state_file       file recording already-archived steps, one per line
#   interval_s       argument given to `sleep` between scans (default: 120)
#
# NOTE: errexit (-e) is not enabled, so a failing command does not stop the
# script; failures that matter have to be checked explicitly.
set -uo pipefail

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <repo_id> <checkpoint_root> <status_log> <state_file> [interval_s]" >&2
  exit 1
fi

REPO_ID="$1"
CHECKPOINT_ROOT="$2"
STATUS_LOG="$3"
STATE_FILE="$4"
INTERVAL_S="${5:-120}"

# The rest of the script appends to both files on every cycle. Refuse to
# start if they cannot be created instead of running without a usable log
# or state file.
if ! mkdir -p -- "$(dirname -- "$STATUS_LOG")" "$(dirname -- "$STATE_FILE")"; then
  echo "error: cannot create parent directory for status log or state file" >&2
  exit 1
fi
if ! touch -- "$STATE_FILE"; then
  echo "error: cannot create state file: ${STATE_FILE}" >&2
  exit 1
fi
#######################################
# Append one timestamped status line to the status log.
# Globals:   STATUS_LOG (read) - file the line is appended to
# Arguments: all arguments are joined into the message text
# Outputs:   "<UTC timestamp> | <message>" appended to STATUS_LOG
#######################################
log() {
  local stamp
  # Same layout as date's %FT%TZ, spelled out field by field.
  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
  printf '%s | %s\n' "${stamp}" "$*" >>"${STATUS_LOG}"
}
#######################################
# Report whether a step is already recorded in the state file.
# Globals:   STATE_FILE (read) - one recorded step per line
# Arguments: $1 - step name to look up
# Returns:   0 if a line equal to $1 exists; non-zero otherwise, including
#            when the state file cannot be read.
#######################################
already_done() {
  # -F: compare the step literally instead of as a regular expression.
  # -x: the whole line must match, so "20" does not match "2000".
  # --: a step starting with "-" is not parsed as an option.
  # stderr is dropped on purpose: an unreadable state file means "not done".
  grep -qxF -- "$1" "$STATE_FILE" 2>/dev/null
}
#######################################
# Record a step in the state file.
# Globals:   STATE_FILE (appended to)
# Arguments: $1 - step name to record
# Outputs:   one line containing the step, appended to STATE_FILE
#######################################
mark_done() {
  local finished_step="$1"
  printf '%s\n' "${finished_step}" >>"${STATE_FILE}"
}
#######################################
# Check that a checkpoint directory contains both metadata marker files.
# Globals:   CHECKPOINT_ROOT (read)
# Arguments: $1 - step name (sub-directory of CHECKPOINT_ROOT)
# Returns:   0 if _CHECKPOINT_METADATA and params/_METADATA are both regular
#            files under the step directory, 1 otherwise.
#######################################
checkpoint_ready() {
  local ckpt_dir="${CHECKPOINT_ROOT}/$1"
  local marker
  for marker in "_CHECKPOINT_METADATA" "params/_METADATA"; do
    # Any missing marker means the checkpoint is not ready.
    [[ -f "${ckpt_dir}/${marker}" ]] || return 1
  done
  return 0
}
#######################################
# Confirm that a step's two metadata files can be downloaded from the repo.
# Globals:
#   REPO_ID (read)
#   ARCHIVE_WORK_DIR (read, optional) - parent directory for the scratch
#     download directory; defaults to /workspace.
# Arguments:
#   $1 - step name (top-level folder inside the repo)
# Returns:
#   0 if both downloads succeed; 1 if the scratch directory cannot be
#   created or either download fails.
#######################################
verify_remote_step() {
  local step="$1"
  local verify_dir="${ARCHIVE_WORK_DIR:-/workspace}/hf_verify_${step}"
  local remote_path
  local status=0

  rm -rf -- "$verify_dir"
  mkdir -p -- "$verify_dir" || return 1

  # Every download is checked: a failure of the first file must not be
  # hidden by a successful download of the second one.
  for remote_path in "${step}/_CHECKPOINT_METADATA" "${step}/params/_METADATA"; do
    if ! hf download "$REPO_ID" "$remote_path" --type model --local-dir "$verify_dir" >/dev/null; then
      status=1
      break
    fi
  done

  # The downloaded copies only serve as proof of presence; drop them.
  rm -rf -- "$verify_dir"
  return "$status"
}
#######################################
# Upload one checkpoint step to the Hugging Face repo and, only once the
# upload has been verified, delete the local copy and record the step.
# Globals:
#   CHECKPOINT_ROOT, REPO_ID, STATUS_LOG (read)
#   ARCHIVE_WORK_DIR (read, optional) - parent directory for the staging
#     directory; defaults to /workspace. Staging uses hard links (cp -al),
#     so it has to live on the same filesystem as CHECKPOINT_ROOT.
# Arguments:
#   $1 - step name (sub-directory of CHECKPOINT_ROOT)
# Returns:
#   0 if the step was archived or its local directory does not exist;
#   1 if staging, upload or verification failed. On failure the local
#   checkpoint is kept and the step is not recorded as done.
#######################################
archive_step() {
  local step="$1"
  local src="${CHECKPOINT_ROOT}/${step}"
  local stage="${ARCHIVE_WORK_DIR:-/workspace}/hf_stage_${step}"

  # An empty step would make src point at CHECKPOINT_ROOT itself, which the
  # final rm -rf would then delete.
  if [[ -z "$step" ]]; then
    log "skip step= empty_step_name"
    return 1
  fi
  if [[ ! -d "$src" ]]; then
    log "skip step=${step} missing_local_dir"
    return 0
  fi

  # Stage as <stage>/<step>/... so the upload lands under a per-step folder.
  rm -rf -- "$stage"
  if ! mkdir -p -- "$stage" || ! cp -al -- "$src" "$stage/"; then
    log "stage_failed step=${step}"
    rm -rf -- "$stage"
    return 1
  fi

  log "upload_start step=${step}"
  if ! hf upload-large-folder "$REPO_ID" "$stage" --type model --num-workers 8 --no-bars >> "$STATUS_LOG" 2>&1; then
    rm -rf -- "$stage"
    return 1
  fi
  if ! verify_remote_step "$step"; then
    log "verify_failed step=${step}"
    rm -rf -- "$stage"
    return 1
  fi

  # Only reached after a verified upload: now the local data may go.
  rm -rf -- "$src" "$stage"
  mark_done "$step" || log "state_write_failed step=${step}"
  log "upload_done step=${step}"
}
log "archiver_start repo=${REPO_ID} root=${CHECKPOINT_ROOT}"

# Poll forever. Each cycle lists the all-digit sub-directories of
# CHECKPOINT_ROOT in ascending numeric order and archives every one that is
# not yet recorded as done and already has its metadata files.
while true; do
  if [[ ! -d "$CHECKPOINT_ROOT" ]]; then
    log "checkpoint_root_missing"
    sleep "$INTERVAL_S"
    continue
  fi

  # The step list is read from fd 3 rather than stdin, so a command started
  # inside the loop cannot swallow the remaining step names by reading
  # its standard input.
  while IFS= read -r -u 3 step; do
    [[ -z "$step" ]] && continue
    if already_done "$step"; then
      continue
    fi
    if ! checkpoint_ready "$step"; then
      log "skip step=${step} checkpoint_not_ready"
      continue
    fi
    archive_step "$step" || log "upload_failed step=${step}"
  done 3< <(
    # grep -E does the same filtering as ripgrep here without requiring an
    # extra tool; a missing filter binary would silently yield no steps.
    find "$CHECKPOINT_ROOT" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' \
      | grep -E '^[0-9]+$' \
      | sort -n
  )

  sleep "$INTERVAL_S"
done