Buckets:

meshllm/layer-split-output / certify-model-job.sh
jamesdumay's picture
download
raw
7.03 kB
#!/bin/bash
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# This script runs inside an HF Job container.
# It clones mesh-llm, builds pinned llama.cpp/skippy correctness tools, runs
# family certification against a mounted GGUF repo, and uploads artifacts.
#
# Environment variables:
#   SOURCE_REPO, SOURCE_FILE, MODEL_ID, FAMILY
#       Required: they are referenced without defaults, so `set -u` aborts the
#       run when one of them is unset.
#   MESH_LLM_REF    — git ref to build from (default: main)
#   SOURCE_REVISION — source model revision (default: main)
#   ARTIFACT_REPO   — optional dataset repo for certification artifacts
#   HF_TOKEN        — injected as a secret when artifact upload is requested
#
# Optional workspace controls:
#   JOB_WORK_ROOT        — parent directory for generated work dirs
#                          (default: /bucket/job-work)
#   JOB_WORK_DIR         — work dir for this run (default: generated as
#                          ${JOB_WORK_ROOT}/cert-<family>-<timestamp>-<pid>)
#   CLEANUP_JOB_WORK_DIR — "true" removes JOB_WORK_DIR when the script exits
#                          (default: true for a generated JOB_WORK_DIR, false
#                          for a caller-supplied one)
#   CERT_ROOT            — certification output root
#                          (default: ${JOB_WORK_DIR}/family-certify)
#
# Optional family-certify controls:
#   RUN_ID (default: hf-cert-<timestamp>), LAYER_END, SPLIT_LAYER, SPLITS,
#   ACTIVATION_WIDTH, PROMPT, CTX_SIZE
#   N_GPU_LAYERS, WIRE_DTYPE, WIRE_DTYPES, STARTUP_TIMEOUT_SECS
#   ALLOW_MISMATCH, STRICT_DTYPE, SKIP_CORRECTNESS, SKIP_DTYPE, SKIP_STATE
#   PREFIX_TOKEN_COUNT, CACHE_HIT_REPEATS, BORROW_RESIDENT_HITS
#   The second and third rows of switches/values are forwarded only when set;
#   the boolean ones (ALLOW_MISMATCH … SKIP_STATE, BORROW_RESIDENT_HITS) are
#   enabled by the values "true" or "1".
#
# Volumes:
#   /source — source GGUF repo (read-only mount)
#   /bucket — writable storage bucket for script + certification workspace
# ── Defaults ──────────────────────────────────────────────────────────────
# Fill in every optional setting the caller left unset or empty.
: "${MESH_LLM_REF:=main}"
: "${SOURCE_REVISION:=main}"
: "${JOB_WORK_ROOT:=/bucket/job-work}"
if [[ -z "${RUN_ID:-}" ]]; then
  RUN_ID="hf-cert-$(date +%Y%m%d-%H%M%S)"
fi

# A caller-supplied JOB_WORK_DIR is kept after the run unless the caller asks
# otherwise; a generated one is removed by default.
if [[ -n "${JOB_WORK_DIR:-}" ]]; then
  : "${CLEANUP_JOB_WORK_DIR:=false}"
else
  # Keep alphanumerics, '.', '_' and '-' from FAMILY; everything else
  # becomes '_' so the name is safe as a path component.
  SAFE_FAMILY="$(printf '%s' "$FAMILY" | tr -c '[:alnum:]._-' '_')"
  JOB_WORK_DIR="${JOB_WORK_ROOT}/cert-${SAFE_FAMILY}-$(date +%Y%m%d%H%M%S)-$$"
  : "${CLEANUP_JOB_WORK_DIR:=true}"
fi
: "${CERT_ROOT:=${JOB_WORK_DIR}/family-certify}"

# Child processes (the certification script, the upload step) read these.
export JOB_WORK_DIR RUN_ID CERT_ROOT
#######################################
# Remove the job work directory when cleanup is enabled.
# Globals:   CLEANUP_JOB_WORK_DIR (read) - cleanup runs only when "true"
#            JOB_WORK_DIR (read)         - directory to remove; skipped if empty
# Outputs:   one progress line on stdout when the directory is removed
# Returns:   status of rm when it runs, 0 otherwise
#######################################
cleanup_job_work_dir() {
  if [[ "${CLEANUP_JOB_WORK_DIR}" == "true" && -n "${JOB_WORK_DIR:-}" ]]; then
    echo "Cleaning job work dir: ${JOB_WORK_DIR}"
    # `--` ends option parsing, so a path that begins with '-' is removed
    # instead of being rejected by rm as an unknown option.
    rm -rf -- "$JOB_WORK_DIR"
  fi
}
# Run the cleanup on every exit path, including aborts triggered by `set -e`.
trap cleanup_job_work_dir EXIT
# Startup banner: log the effective job parameters before any work begins.
# Each parameter line is its own printf so output order matches one-line-at-a-
# time printing even if a variable turns out to be unset under `set -u`.
printf '%s\n' \
  "╔══════════════════════════════════════════════════════════╗" \
  "║ Family Certification Job ║" \
  "╠══════════════════════════════════════════════════════════╣"
printf '║ Source: %s/%s\n' "${SOURCE_REPO}" "${SOURCE_FILE}"
printf '║ Model: %s\n' "${MODEL_ID}"
printf '║ Family: %s\n' "${FAMILY}"
printf '║ Build: mesh-llm @ %s\n' "${MESH_LLM_REF}"
printf '║ Run: %s\n' "${RUN_ID}"
printf '║ Work: %s\n' "${JOB_WORK_DIR}"
printf '%s\n' \
  "╚══════════════════════════════════════════════════════════╝" \
  ""
echo "=== [1/8] Installing build dependencies ==="

#######################################
# Run a command quietly; replay its captured output only when it fails.
# Arguments: $@ - command (or shell function) and its arguments
# Outputs:   the command's captured stdout+stderr, on stderr, on failure only
# Returns:   exit status of the command
#######################################
run_quiet() {
  local log status=0
  log="$(mktemp)"
  "$@" > "$log" 2>&1 || status=$?
  if (( status != 0 )); then
    cat -- "$log" >&2
  fi
  rm -f -- "$log"
  return "$status"
}

# Download the rustup installer over HTTPS and run it non-interactively (-y).
install_rust() {
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
}

# `apt-get update` is a statement of its own: as the left side of
# `update && install`, its failure is ignored by `set -e` and the script would
# carry on without the build dependencies.
apt-get update -qq
run_quiet apt-get install -y -qq \
  cmake git curl build-essential pkg-config libssl-dev \
  python3-pip python3-venv jq

echo "=== [2/8] Installing Rust ==="
run_quiet install_rust
# Load cargo into PATH. rustup writes its env file under CARGO_HOME, which
# defaults to $HOME/.cargo; /root is the fallback when HOME is not set.
# shellcheck disable=SC1090,SC1091
source "${CARGO_HOME:-${HOME:-/root}/.cargo}/env"
echo "=== [3/8] Cloning mesh-llm ==="
# Blob-less clone keeps the initial download small; blobs are fetched on demand.
git clone --filter=blob:none https://github.com/Mesh-LLM/mesh-llm.git /tmp/build
cd /tmp/build
# Fetch exactly the requested ref and check it out detached. The same two
# commands apply whatever kind of ref MESH_LLM_REF names, so no branch/tag
# probing (and no extra round trips to the remote) is needed first.
git fetch --depth 1 origin "$MESH_LLM_REF"
git checkout --detach FETCH_HEAD
# Full clone needed for git-am patches in prepare-llama: strip the
# partial-clone filter from the script before it runs.
sed -i 's/--filter=blob:none //' scripts/prepare-llama.sh
echo "=== [4/8] Building pinned llama.cpp CPU ABI ==="
# Both build steps merge stderr into stdout and keep only the last 20 lines
# in the job log; `pipefail` still propagates a failing script's status.
scripts/prepare-llama.sh pinned |& tail -n 20
LLAMA_BUILD_DIR="/tmp/build/.deps/llama-build/build-stage-abi-cpu"
# The build directory is handed to build-llama.sh through its environment only.
env LLAMA_STAGE_BUILD_DIR="${LLAMA_BUILD_DIR}" scripts/build-llama.sh |& tail -n 20
echo "=== [5/8] Verifying source model ==="
SOURCE_PATH="/source/${SOURCE_FILE}"
if [[ -f "${SOURCE_PATH}" ]]; then
  # Report the path together with its human-readable size (first du column).
  printf ' Source: %s (%s)\n' "${SOURCE_PATH}" "$(du -h "${SOURCE_PATH}" | cut -f1)"
else
  # Missing model: list up to 30 GGUF files from the mount to help spot a
  # wrong SOURCE_FILE, then fail the job.
  printf '%s\n' \
    "ERROR: Source file not found at ${SOURCE_PATH}" \
    "" \
    "Available GGUF files in /source:"
  find /source -name "*.gguf" -type f | sort | head -n 30
  exit 1
fi
echo "=== [6/8] Running family certification ==="
# Mandatory arguments for scripts/family-certify.sh, kept in an array so
# values containing spaces stay single words.
declare -a CERT_ARGS=()
CERT_ARGS+=(--family "${FAMILY}")
CERT_ARGS+=(--target-model "${SOURCE_PATH}")
CERT_ARGS+=(--model-id "${MODEL_ID}")
CERT_ARGS+=(--cert-root "${CERT_ROOT}")
CERT_ARGS+=(--run-id "${RUN_ID}")
#######################################
# Append "<flag> <value>" to CERT_ARGS when the named variable is non-empty.
# Globals:   CERT_ARGS (appended to)
# Arguments: $1 - name of the variable holding the value
#            $2 - command-line flag to emit before the value
# Returns:   0
#######################################
append_opt() {
  local var_name="$1" option="$2"
  # Indirect expansion reads the variable whose name was passed in; an unset
  # variable counts as empty.
  local setting="${!var_name:-}"
  [[ -n "$setting" ]] || return 0
  CERT_ARGS+=("$option" "$setting")
}
#######################################
# Append a bare switch to CERT_ARGS when the named variable enables it.
# Only the exact values "true" and "1" count as enabled.
# Globals:   CERT_ARGS (appended to)
# Arguments: $1 - name of the variable holding the setting
#            $2 - command-line switch to emit
# Returns:   0
#######################################
append_bool() {
  local var_name="$1" option="$2"
  case "${!var_name:-}" in
    true | 1)
      CERT_ARGS+=("$option")
      ;;
  esac
}
# Map optional environment controls onto family-certify.sh flags. Each entry
# is ENV_NAME=flag; the value-carrying options are appended first, then the
# boolean switches, in the listed order.
for spec in \
  LAYER_END=--layer-end \
  SPLIT_LAYER=--split-layer \
  SPLITS=--splits \
  ACTIVATION_WIDTH=--activation-width \
  PROMPT=--prompt \
  CTX_SIZE=--ctx-size \
  N_GPU_LAYERS=--n-gpu-layers \
  STARTUP_TIMEOUT_SECS=--startup-timeout-secs \
  WIRE_DTYPE=--wire-dtype \
  WIRE_DTYPES=--wire-dtypes \
  PREFIX_TOKEN_COUNT=--prefix-token-count \
  CACHE_HIT_REPEATS=--cache-hit-repeats
do
  # Text before the first '=' is the variable name, the rest is the flag.
  append_opt "${spec%%=*}" "${spec#*=}"
done
for spec in \
  ALLOW_MISMATCH=--allow-mismatch \
  STRICT_DTYPE=--strict-dtype \
  SKIP_CORRECTNESS=--skip-correctness \
  SKIP_DTYPE=--skip-dtype \
  SKIP_STATE=--skip-state \
  BORROW_RESIDENT_HITS=--borrow-resident-hits
do
  append_bool "${spec%%=*}" "${spec#*=}"
done
unset -v spec
# Run the certification; the llama.cpp build directory is passed through the
# script's environment only.
LLAMA_STAGE_BUILD_DIR="$LLAMA_BUILD_DIR" scripts/family-certify.sh "${CERT_ARGS[@]}"
# Pick the first directory exactly two levels below the run directory.
# `-print -quit` makes find stop after one match, replacing `| head -1`: under
# `pipefail` that pipeline can fail with SIGPIPE when several directories
# match. Silencing find and adding `|| true` lets a missing run directory
# reach the explicit error below instead of aborting on find's exit status.
ARTIFACT_DIR="$(
  find "$CERT_ROOT/$RUN_ID" -mindepth 2 -maxdepth 2 -type d -print -quit \
    2>/dev/null || true
)"
if [[ -z "$ARTIFACT_DIR" || ! -d "$ARTIFACT_DIR" ]]; then
  echo "ERROR: certification artifact directory not found under $CERT_ROOT/$RUN_ID"
  exit 1
fi
# Exported because the upload step reads it from the environment.
export ARTIFACT_DIR
echo " ✓ Artifacts: $ARTIFACT_DIR"
echo "=== [7/8] Uploading artifacts ==="
if [[ -z "${ARTIFACT_REPO:-}" ]]; then
  echo " ARTIFACT_REPO not set; leaving artifacts in the job workspace."
  # The EXIT trap deletes JOB_WORK_DIR when cleanup is enabled, so artifacts
  # stored inside it do not outlive the job. Say so instead of implying they
  # are kept.
  if [[ "${CLEANUP_JOB_WORK_DIR:-}" == "true" \
    && "${ARTIFACT_DIR:-}" == "${JOB_WORK_DIR:-}"/* ]]; then
    echo " WARNING: CLEANUP_JOB_WORK_DIR=true; ${JOB_WORK_DIR} (including these artifacts) is removed on exit." >&2
  fi
else
  # Fail before building the venv if the upload cannot authenticate; the
  # Python step below reads HF_TOKEN from the environment.
  if [[ -z "${HF_TOKEN:-}" ]]; then
    echo "ERROR: ARTIFACT_REPO is set but HF_TOKEN is missing; cannot upload artifacts." >&2
    exit 1
  fi
  python3 -m venv /tmp/venv > /dev/null
  /tmp/venv/bin/pip install -q huggingface_hub
  # Quoted delimiter: the shell passes the Python source through verbatim (no
  # `$` or backtick expansion). All inputs reach Python via os.environ.
  /tmp/venv/bin/python3 << 'PYTHON'
from huggingface_hub import HfApi
import os
api = HfApi(token=os.environ["HF_TOKEN"])
artifact_repo = os.environ["ARTIFACT_REPO"]
run_id = os.environ["RUN_ID"]
family = os.environ["FAMILY"]
model_id = os.environ["MODEL_ID"]
artifact_dir = os.environ["ARTIFACT_DIR"]
# Create the dataset repo on first use; an existing repo is reused.
api.create_repo(artifact_repo, repo_type="dataset", exist_ok=True)
# Each run gets its own folder in the repo, keyed by RUN_ID.
path_in_repo = f"runs/{run_id}"
api.upload_folder(
    repo_id=artifact_repo,
    repo_type="dataset",
    folder_path=artifact_dir,
    path_in_repo=path_in_repo,
    commit_message=f"Certification artifacts for {family} ({model_id})",
)
print(f" ✓ Published: https://huggingface.co/datasets/{artifact_repo}/tree/main/{path_in_repo}")
PYTHON
fi
# Final status line for the job log.
printf '%s\n' "=== [8/8] Done ==="
printf 'Certification run completed for %s (%s)\n' "${MODEL_ID}" "${FAMILY}"

Xet Storage Details

Size:
7.03 kB
·
Xet hash:
a14adc1713686acb7f06d0d2dc191729c7d4b01b298908edfe6a329a41fe029e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.