#!/usr/bin/env bash
# Checkpoint archiver: takes a repo id, a checkpoint root directory, a status
# log path and a state file path, plus an optional polling interval in seconds.
#
# Unset variables are errors and pipelines report failure from any stage.
# `-e` is not enabled, so a failing command does not terminate the script.
set -uo pipefail

# The first four positional arguments are mandatory.
[[ $# -ge 4 ]] || {
  echo "usage: $0 <repo_id> <checkpoint_root> <status_log> <state_file> [interval_s]" >&2
  exit 1
}

REPO_ID="$1"              # repository id handed to the `hf` CLI
CHECKPOINT_ROOT="$2"      # directory scanned for per-step checkpoint directories
STATUS_LOG="$3"           # file that log() appends to
STATE_FILE="$4"           # one archived step per line
INTERVAL_S="${5:-120}"    # passed to `sleep` between scans; defaults to 120

# Ensure the parent directories of both output files exist, and that the
# state file itself exists before anything reads it.
mkdir -p "$(dirname "$STATUS_LOG")"
mkdir -p "$(dirname "$STATE_FILE")"
touch "$STATE_FILE"
|
|
# Append one line to the status log: "<UTC ISO-8601 timestamp> | <message>".
# Globals:   STATUS_LOG (appended to)
# Arguments: message words; they are joined into a single string via "$*"
log() {
  local stamp
  stamp="$(date -u +%FT%TZ)"
  printf '%s | %s\n' "$stamp" "$*" >> "$STATUS_LOG"
}
|
|
# Report whether a step has already been recorded in the state file.
# Globals:   STATE_FILE (read)
# Arguments: $1 - step name
# Returns:   0 if a line of STATE_FILE equals $1 exactly, non-zero otherwise
already_done() {
  # An empty name would otherwise match any blank line in the state file.
  [[ -n "${1-}" ]] || return 1
  # -F compares the step literally (no regex metacharacters), -x requires a
  # whole-line match, and -- protects against a name starting with '-'.
  # stderr is silenced so an unreadable/missing state file just means "not done".
  grep -qxF -- "$1" "$STATE_FILE" 2>/dev/null
}
|
|
# Record a step as finished by appending it as a new line to the state file.
# Globals:   STATE_FILE (appended to)
# Arguments: $1 - step name
mark_done() {
  local finished_step="$1"
  printf '%s\n' "$finished_step" >> "$STATE_FILE"
}
|
|
# Decide whether a local checkpoint directory contains both marker files.
# Globals:   CHECKPOINT_ROOT (read)
# Arguments: $1 - step name (sub-directory of CHECKPOINT_ROOT)
# Returns:   0 when both <step>/_CHECKPOINT_METADATA and
#            <step>/params/_METADATA are regular files, 1 otherwise
checkpoint_ready() {
  local ckpt_dir="${CHECKPOINT_ROOT}/$1"

  [[ -f "${ckpt_dir}/_CHECKPOINT_METADATA" ]] || return 1
  [[ -f "${ckpt_dir}/params/_METADATA" ]]
}
|
|
# Confirm that a step's two marker files can be downloaded from the remote repo.
# Globals:   REPO_ID (read)
#            HF_ARCHIVE_WORK_DIR (read, optional) - parent of the scratch
#              directory; defaults to /workspace
# Arguments: $1 - step name
# Returns:   0 only if BOTH downloads succeed, 1 otherwise
verify_remote_step() {
  local step="$1"
  local verify_dir="${HF_ARCHIVE_WORK_DIR:-/workspace}/hf_verify_${step}"
  local rel
  local status=0

  rm -rf -- "$verify_dir"
  mkdir -p -- "$verify_dir" || return 1

  # Check every download: previously only the last command's status was
  # returned, so a missing _CHECKPOINT_METADATA went unnoticed.
  for rel in "${step}/_CHECKPOINT_METADATA" "${step}/params/_METADATA"; do
    if ! hf download "$REPO_ID" "$rel" --type model --local-dir "$verify_dir" >/dev/null; then
      status=1
      break
    fi
  done

  # The downloaded copies are only a probe; do not leave them on disk.
  rm -rf -- "$verify_dir"
  return "$status"
}
|
|
# Upload one local checkpoint to the remote repo and, only after the upload
# has been verified, delete the local copy and record the step as done.
# Globals:   CHECKPOINT_ROOT, REPO_ID (read); STATUS_LOG (appended to)
#            HF_ARCHIVE_WORK_DIR (read, optional) - parent of the staging
#              directory; defaults to /workspace. `cp -al` creates hard links,
#              so it must be on the same filesystem as CHECKPOINT_ROOT.
# Arguments: $1 - step name (sub-directory of CHECKPOINT_ROOT)
# Returns:   0 on success or when the local directory is missing (skip);
#            1 if staging, upload or verification failed. On failure the
#            local checkpoint is left untouched and the step is not marked.
archive_step() {
  local step="$1"
  local src="${CHECKPOINT_ROOT}/${step}"
  local stage="${HF_ARCHIVE_WORK_DIR:-/workspace}/hf_stage_${step}"

  # An empty step would make $src the checkpoint root itself, which is
  # deleted at the end of this function.
  if [[ -z "$step" ]]; then
    log "skip empty_step_name"
    return 1
  fi

  if [[ ! -d "$src" ]]; then
    log "skip step=${step} missing_local_dir"
    return 0
  fi

  # Stage a hard-linked copy so the uploaded folder contains "<step>/..."
  # as its only top-level entry.
  rm -rf -- "$stage"
  if ! mkdir -p -- "$stage" || ! cp -al -- "$src" "$stage/"; then
    log "stage_failed step=${step}"
    rm -rf -- "$stage"
    return 1
  fi

  log "upload_start step=${step}"
  if ! hf upload-large-folder "$REPO_ID" "$stage" --type model --num-workers 8 --no-bars >> "$STATUS_LOG" 2>&1; then
    rm -rf -- "$stage"
    return 1
  fi

  if ! verify_remote_step "$step"; then
    log "verify_failed step=${step}"
    rm -rf -- "$stage"
    return 1
  fi

  # Only now is it safe to drop the local data.
  rm -rf -- "$src" "$stage"
  mark_done "$step"
  log "upload_done step=${step}"
}
|
|
log "archiver_start repo=${REPO_ID} root=${CHECKPOINT_ROOT}"

# Poll forever: on every pass, archive each ready, not-yet-archived step
# directory under CHECKPOINT_ROOT, then sleep INTERVAL_S.
while true; do
  if [[ ! -d "$CHECKPOINT_ROOT" ]]; then
    log "checkpoint_root_missing"
    sleep "$INTERVAL_S"
    continue
  fi

  # Candidate steps are the immediate sub-directories whose names are all
  # digits, visited in ascending numeric order. The list is read on fd 3 so
  # that nothing run inside the loop body can consume it through stdin.
  while IFS= read -r step <&3; do
    [[ -z "$step" ]] && continue
    if already_done "$step"; then
      continue
    fi
    if ! checkpoint_ready "$step"; then
      log "skip step=${step} checkpoint_not_ready"
      continue
    fi
    archive_step "$step" || log "upload_failed step=${step}"
  done 3< <(
    find "$CHECKPOINT_ROOT" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' \
      | LC_ALL=C grep -E '^[0-9]+$' \
      | sort -n
  )

  sleep "$INTERVAL_S"
done
|
|