Add no-weights Docker image build path

Browse files

Files changed (8) hide show

.dockerignore +8 -0
.hfignore +2 -0
README.md +72 -0
docker/Dockerfile +56 -0
docker/download_sidecar.py +20 -0
docker/entrypoint.sh +255 -0
manifest.json +9 -0
scripts/build_docker_image.sh +17 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,8 @@

+.cache/
+patches/
+results/
+HANDOFF.md
+*.log
+__pycache__/
+**/__pycache__/
+*.pyc

.hfignore CHANGED Viewed

@@ -1,3 +1,5 @@
 HANDOFF.md
 patches/**
 .cache/**

 HANDOFF.md
 patches/**
 .cache/**
+__pycache__/**
+*.pyc

README.md CHANGED Viewed

@@ -25,6 +25,8 @@ This is an experimental reproducibility release, not a production-ready model. I
 - `scripts/setup_repro_from_hf.sh`: one-command setup for a new machine.
 - `scripts/serve_phase2_eagle.sh`: OpenAI-compatible vLLM server launcher.
 - `scripts/bench_tokens_sec_phase2_eagle.sh`: smoke/benchmark runner.
 - `scripts/test_triton_codebook_match.py`: isolated kernel equivalence harness.
 - `scripts/measure_kv_cache_compression.py`: live KV-cache measurement helper.
 - `results/`: selected validation outputs.
@@ -55,6 +57,76 @@ export HF_TOKEN=...
 Do not bake tokens into Docker images or committed files.
 ## One-Command Setup
 Pick a host directory. The setup script creates this layout:

 - `scripts/setup_repro_from_hf.sh`: one-command setup for a new machine.
 - `scripts/serve_phase2_eagle.sh`: OpenAI-compatible vLLM server launcher.
 - `scripts/bench_tokens_sec_phase2_eagle.sh`: smoke/benchmark runner.
+- `scripts/build_docker_image.sh`: builds a no-weights runtime image.
+- `docker/`: Dockerfile and entrypoint for the no-weights runtime image.
 - `scripts/test_triton_codebook_match.py`: isolated kernel equivalence harness.
 - `scripts/measure_kv_cache_compression.py`: live KV-cache measurement helper.
 - `results/`: selected validation outputs.
 Do not bake tokens into Docker images or committed files.
+## No-Weights Docker Image
+This is the simplest hosting path if you are willing to build an image. The image bakes in:
+```text
+vLLM Spectral fork at 008dd7f87fb9de185e536ad30b4d524024ed9b9f
+GemmaCut launcher entrypoint
+Spectral sidecar artifacts/spectral_sidecar_chat_v2.pt
+git/cmake/ninja build tools for inspection and follow-up work
+```
+It does **not** bake in model weights. `Intel/gemma-4-31B-it-int4-AutoRound` and `RedHatAI/gemma-4-31B-it-speculator.eagle3` are downloaded at runtime into the mounted Hugging Face cache.
+Build:
+```bash
+hf download satya007/gemmacut-spectral \
+  .dockerignore \
+  docker/Dockerfile \
+  docker/entrypoint.sh \
+  docker/download_sidecar.py \
+  scripts/build_docker_image.sh \
+  --local-dir ./gemmacut-spectral-image
+cd ./gemmacut-spectral-image
+chmod +x ./scripts/build_docker_image.sh
+IMAGE=gemmacut-spectral:008dd7f87 ./scripts/build_docker_image.sh
+```
+Smoke:
+```bash
+mkdir -p "$PWD/hf-cache" "$PWD/results"
+docker run --rm --gpus all --ipc=host \
+  -e HF_TOKEN \
+  -v "$PWD/hf-cache:/root/.cache/huggingface" \
+  -v "$PWD/results:/workspace/results_bench" \
+  gemmacut-spectral:008dd7f87 smoke
+```
+Serve:
+```bash
+docker run --rm --gpus all --ipc=host \
+  -p 8000:8000 \
+  -e HF_TOKEN \
+  -e MAX_MODEL_LEN=512 \
+  -e MAX_NUM_BATCHED_TOKENS=512 \
+  -e MAX_NUM_SEQS=2 \
+  -e GPU_MEMORY_UTILIZATION=0.8 \
+  -v "$PWD/hf-cache:/root/.cache/huggingface" \
+  gemmacut-spectral:008dd7f87 serve
+```
+Optional: build without the sidecar and mount it yourself.
+```bash
+IMAGE=gemmacut-spectral:008dd7f87-nosidecar \
+  ./scripts/build_docker_image.sh --build-arg INCLUDE_SIDECAR=0
+docker run --rm --gpus all --ipc=host \
+  -p 8000:8000 \
+  -e HF_TOKEN \
+  -e SPECTRAL_SIDECAR=/workspace/spectral_sidecar_chat_v2.pt \
+  -v "$PWD/hf-cache:/root/.cache/huggingface" \
+  -v "$PWD/spectral_sidecar_chat_v2.pt:/workspace/spectral_sidecar_chat_v2.pt:ro" \
+  gemmacut-spectral:008dd7f87-nosidecar serve
+```
 ## One-Command Setup
 Pick a host directory. The setup script creates this layout:

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,56 @@

+# No-weights runtime image: a vLLM base image plus a pinned checkout of the
+# Spectral vLLM fork, the launcher entrypoint and (optionally) the sidecar file.
+ARG BASE_IMAGE=vllm/vllm-openai:gemma4-cu130
+FROM ${BASE_IMAGE}
+# Build-time inputs; override any of them with `docker build --build-arg NAME=value`.
+ARG VLLM_REPO=https://github.com/bluecopa/vllm-spectral.git
+ARG VLLM_BRANCH=spectral-codebook-docker
+ARG VLLM_COMMIT=008dd7f87fb9de185e536ad30b4d524024ed9b9f
+ARG HF_REPO_ID=satya007/gemmacut-spectral
+ARG SIDECAR_SHA256=e47a36c13467cbedf720e7f782b976df3dcda2d989c727113a8315008661a3e4
+ARG INCLUDE_SIDECAR=1
+LABEL org.opencontainers.image.title="gemmacut-spectral"
+LABEL org.opencontainers.image.description="GemmaCut SpectralQuant Phase 2 + Eagle3 vLLM runtime; model weights are not baked into the image."
+LABEL org.opencontainers.image.source="https://github.com/bluecopa/vllm-spectral"
+LABEL org.opencontainers.image.revision="${VLLM_COMMIT}"
+# Defaults consumed by the entrypoint; every one can be overridden with `docker run -e`.
+ENV VLLM_SOURCE=/opt/vllm-spectral \
+    GEMMACUT_HOME=/opt/gemmacut \
+    SPECTRAL_SIDECAR=/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt \
+    HF_HUB_DISABLE_XET=1 \
+    SPECTRAL_TRITON_COMPRESS=1 \
+    SPECTRAL_TRITON_DEQUANT=1 \
+    SPECTRAL_CUDA_GRAPH=1 \
+    SPECTRAL_VERIFY=0 \
+    DISABLE_HYBRID_KV_CACHE_MANAGER=0
+# bash with pipefail so a failing stage inside a RUN pipeline fails the build.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+      ca-certificates \
+      cmake \
+      git \
+      ninja-build && \
+    rm -rf /var/lib/apt/lists/*
+# Clone the fork and pin it to the exact commit; the log line records it in the build output.
+RUN git clone --branch "${VLLM_BRANCH}" "${VLLM_REPO}" "${VLLM_SOURCE}" && \
+    git -C "${VLLM_SOURCE}" checkout "${VLLM_COMMIT}" && \
+    git -C "${VLLM_SOURCE}" log --oneline -1
+COPY docker/download_sidecar.py /tmp/download_sidecar.py
+# HF_HOME points the Hugging Face download cache at a scratch directory that is
+# deleted in the same layer, so the sidecar is stored in the image only once.
+RUN mkdir -p "${GEMMACUT_HOME}/artifacts" && \
+    if [[ "${INCLUDE_SIDECAR}" == "1" ]]; then \
+      HF_HOME=/tmp/hf-sidecar-cache \
+      HF_REPO_ID="${HF_REPO_ID}" \
+      SIDECAR_SHA256="${SIDECAR_SHA256}" \
+      python3 /tmp/download_sidecar.py; \
+    else \
+      echo "INCLUDE_SIDECAR=0; mount or set SPECTRAL_SIDECAR at runtime"; \
+    fi && \
+    rm -rf /tmp/download_sidecar.py /tmp/hf-sidecar-cache
+COPY docker/entrypoint.sh /usr/local/bin/gemmacut-spectral
+RUN chmod +x /usr/local/bin/gemmacut-spectral
+EXPOSE 8000
+ENTRYPOINT ["gemmacut-spectral"]
+CMD ["serve"]

docker/download_sidecar.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import hashlib
+import os
+import shutil
+from huggingface_hub import hf_hub_download
+repo_id = os.environ["HF_REPO_ID"]
+expected = os.environ["SIDECAR_SHA256"]
+target = "/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt"
+path = hf_hub_download(
+    repo_id=repo_id,
+    filename="artifacts/spectral_sidecar_chat_v2.pt",
+    repo_type="model",
+)
+shutil.copyfile(path, target)
+actual = hashlib.sha256(open(target, "rb").read()).hexdigest()
+if actual != expected:
+    raise SystemExit(f"sidecar sha256 mismatch: expected {expected}, got {actual}")
+print(f"sidecar ready: {target} sha256={actual}")

docker/entrypoint.sh ADDED Viewed

	@@ -0,0 +1,255 @@

+#!/usr/bin/env bash
+set -euo pipefail
+COMMAND="${1:-serve}"
+if [ "$#" -gt 0 ]; then
+  shift
+fi
+MODEL="${MODEL:-Intel/gemma-4-31B-it-int4-AutoRound}"
+DRAFT="${DRAFT:-RedHatAI/gemma-4-31B-it-speculator.eagle3}"
+SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-gemmacut-spectral}"
+SPECTRAL_SIDECAR="${SPECTRAL_SIDECAR:-/opt/gemmacut/artifacts/spectral_sidecar_chat_v2.pt}"
+VLLM_SOURCE="${VLLM_SOURCE:-/opt/vllm-spectral}"
+PORT="${PORT:-8000}"
+MAX_MODEL_LEN="${MAX_MODEL_LEN:-512}"
+MAX_NUM_BATCHED_TOKENS="${MAX_NUM_BATCHED_TOKENS:-512}"
+MAX_NUM_SEQS="${MAX_NUM_SEQS:-2}"
+GPU_MEMORY_UTILIZATION="${GPU_MEMORY_UTILIZATION:-0.8}"
+NUM_SPEC_TOKENS="${NUM_SPEC_TOKENS:-3}"
+SPECTRAL_CUDA_GRAPH="${SPECTRAL_CUDA_GRAPH:-1}"
+VLLM_LOGGING_LEVEL="${VLLM_LOGGING_LEVEL:-INFO}"
+DISABLE_HYBRID_KV_CACHE_MANAGER="${DISABLE_HYBRID_KV_CACHE_MANAGER:-0}"
+RESULTS_ROOT="${RESULTS_ROOT:-/workspace/results_bench}"
+export VLLM_LOGGING_LEVEL
+export SPECTRAL_CUDA_GRAPH
+export SPECTRAL_TRITON_COMPRESS="${SPECTRAL_TRITON_COMPRESS:-1}"
+export SPECTRAL_TRITON_DEQUANT="${SPECTRAL_TRITON_DEQUANT:-1}"
+export SPECTRAL_VERIFY="${SPECTRAL_VERIFY:-0}"
+export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"
+unset SPECTRAL_SHARED_ALLOC
+if [ "${HF_HUB_OFFLINE:-0}" = "1" ]; then
+  export HF_HUB_OFFLINE=1
+else
+  unset HF_HUB_OFFLINE
+fi
+prepare_overlay() {
+  local run_src="${SPECTRAL_RUN_SRC:-/tmp/vllm-spectral-run}"
+  local site
+  if [ ! -d "$VLLM_SOURCE" ]; then
+    echo "Missing VLLM_SOURCE: $VLLM_SOURCE" >&2
+    exit 1
+  fi
+  if [ ! -f "$SPECTRAL_SIDECAR" ]; then
+    echo "Missing SPECTRAL_SIDECAR: $SPECTRAL_SIDECAR" >&2
+    exit 1
+  fi
+  site="$(python3 - <<'PY'
+import pathlib
+import vllm
+print(pathlib.Path(vllm.__file__).resolve().parent)
+PY
+)"
+  rm -rf "$run_src"
+  cp -a "$VLLM_SOURCE" "$run_src"
+  shopt -s nullglob
+  for f in "$site"/_C*.so "$site"/_moe_C*.so "$site"/_flashmla*.so "$site"/cumem_allocator*.so; do
+    ln -sf "$f" "$run_src/vllm/"
+  done
+  mkdir -p "$run_src/vllm/vllm_flash_attn"
+  for f in "$site"/vllm_flash_attn/_vllm_fa2_C*.so "$site"/vllm_flash_attn/_vllm_fa3_C*.so; do
+    ln -sf "$f" "$run_src/vllm/vllm_flash_attn/"
+  done
+  ln -sfn "$site/vllm_flash_attn/cute" "$run_src/vllm/vllm_flash_attn/cute"
+  ln -sfn "$site/vllm_flash_attn/layers" "$run_src/vllm/vllm_flash_attn/layers"
+  mkdir -p "$run_src/vllm/third_party" "$run_src/vllm/third_party/flashmla"
+  ln -sfn "$site/third_party/triton_kernels" "$run_src/vllm/third_party/triton_kernels"
+  ln -sf "$site/third_party/flashmla/flash_mla_interface.py" "$run_src/vllm/third_party/flashmla/"
+  shopt -u nullglob
+  export PYTHONPATH="$run_src:$run_src/vllm/third_party${PYTHONPATH:+:$PYTHONPATH}"
+}
+server_args() {
+  local args=(
+    --host "${HOST:-0.0.0.0}"
+    --port "$PORT"
+    --model "$MODEL"
+    --served-model-name "$SERVED_MODEL_NAME"
+    --spectral-calibration "$SPECTRAL_SIDECAR"
+    --spectral-quantize
+    --kv-cache-dtype fp8_e4m3
+    --max-model-len "$MAX_MODEL_LEN"
+    --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS"
+    --max-num-seqs "$MAX_NUM_SEQS"
+    --gpu-memory-utilization "$GPU_MEMORY_UTILIZATION"
+    --compilation-config "{\"compile_sizes\": []}"
+    --speculative-config "{\"model\":\"$DRAFT\",\"num_speculative_tokens\":$NUM_SPEC_TOKENS,\"method\":\"eagle3\"}"
+  )
+  if [ "$DISABLE_HYBRID_KV_CACHE_MANAGER" = "1" ]; then
+    args+=(--disable-hybrid-kv-cache-manager)
+  fi
+  printf '%s\0' "${args[@]}"
+}
+# Prepare the overlay, then replace this shell with the OpenAI-compatible API
+# server. Arguments given to this function are appended after the generated ones.
+run_server() {
+  prepare_overlay
+  local -a cli=()
+  local arg
+  # Rebuild the argument list from server_args' NUL-delimited output.
+  while IFS= read -r -d '' arg; do
+    cli+=("$arg")
+  done < <(server_args)
+  exec python3 -m vllm.entrypoints.openai.api_server "${cli[@]}" "$@"
+}
+wait_for_server() {
+  python3 - <<PY
+import os
+import sys
+import time
+import urllib.request
+pid = int(os.environ["SERVER_PID"])
+port = int(os.environ["PORT"])
+deadline = time.time() + int(os.environ.get("SERVER_TIMEOUT", "300"))
+url = f"http://127.0.0.1:{port}/v1/models"
+while time.time() < deadline:
+    try:
+        os.kill(pid, 0)
+    except OSError:
+        raise SystemExit("server exited early")
+    try:
+        with urllib.request.urlopen(url, timeout=2) as response:
+            if response.status == 200:
+                print("SERVER_READY", flush=True)
+                raise SystemExit(0)
+    except Exception:
+        time.sleep(1)
+raise SystemExit("server did not become ready")
+PY
+}
+# Launch the API server in the background bound to 127.0.0.1, log to
+# $SERVER_LOG, and block until wait_for_server reports it ready.
+# Globals: SERVER_LOG, PORT (read); HOST, SERVER_PID (set and exported)
+start_background_server() {
+  prepare_overlay
+  # server_args reads HOST, so the background server listens on loopback only.
+  export HOST=127.0.0.1
+  local -a cli=()
+  local arg
+  while IFS= read -r -d '' arg; do
+    cli+=("$arg")
+  done < <(server_args)
+  python3 -m vllm.entrypoints.openai.api_server "${cli[@]}" > "$SERVER_LOG" 2>&1 &
+  SERVER_PID=$!
+  export SERVER_PID PORT
+  # Stop and reap the server on any exit path of the script.
+  trap 'kill "$SERVER_PID" >/dev/null 2>&1 || true; wait "$SERVER_PID" >/dev/null 2>&1 || true' EXIT
+  wait_for_server
+}
+run_smoke_client() {
+  python3 - <<PY
+import json
+import urllib.request
+model = "${SERVED_MODEL_NAME}"
+url = "http://127.0.0.1:${PORT}/v1/chat/completions"
+checks = [
+    ("What is 2+2? Answer with just the number.", "4"),
+    ("Paris is the capital of which country? Answer with one word.", "France"),
+]
+for prompt, expected in checks:
+    payload = {
+        "model": model,
+        "messages": [{"role": "user", "content": prompt}],
+        "max_tokens": 16,
+        "temperature": 0,
+    }
+    request = urllib.request.Request(
+        url,
+        data=json.dumps(payload).encode("utf-8"),
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    with urllib.request.urlopen(request, timeout=120) as response:
+        data = json.load(response)
+    text = data["choices"][0]["message"]["content"].strip()
+    print(f"{prompt} => {text}", flush=True)
+    if expected.lower() not in text.lower():
+        raise SystemExit(
+            f"semantic smoke failed: expected {expected!r} in response {text!r}")
+print("SMOKE_PROMPTS_OK", flush=True)
+PY
+}
+run_smoke() {
+  RUN_ID="${RUN_ID:-smoke_$(date +%Y%m%d_%H%M%S)}"
+  OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
+  mkdir -p "$OUT"
+  SERVER_LOG="$OUT/server.log"
+  start_background_server
+  run_smoke_client | tee "$OUT/smoke_outputs.txt"
+  echo "SMOKE_OUT=$OUT"
+}
+run_bench() {
+  RUN_ID="${RUN_ID:-tokens_sec_phase2_eagle_$(date +%Y%m%d_%H%M%S)}"
+  OUT="${RESULTS_DIR:-$RESULTS_ROOT/$RUN_ID}"
+  mkdir -p "$OUT"
+  SERVER_LOG="$OUT/server.log"
+  start_background_server
+  if [ "${RUN_SMOKE:-0}" = "1" ]; then
+    run_smoke_client | tee "$OUT/smoke_outputs.txt"
+  fi
+  if [ "${SMOKE_ONLY:-0}" = "1" ]; then
+    echo "SMOKE_ONLY=1; skipping benchmark"
+    echo "BENCH_OUT=$OUT"
+    exit 0
+  fi
+  python3 -m vllm.entrypoints.cli.main bench serve \
+    --backend openai-chat \
+    --base-url "http://127.0.0.1:$PORT" \
+    --endpoint /v1/chat/completions \
+    --model "$SERVED_MODEL_NAME" \
+    --tokenizer "$MODEL" \
+    --dataset-name random \
+    --random-input-len "${INPUT_LEN:-128}" \
+    --random-output-len "${OUTPUT_LEN:-32}" \
+    --num-prompts "${NUM_PROMPTS:-8}" \
+    --num-warmups "${NUM_WARMUPS:-1}" \
+    --request-rate "${REQUEST_RATE:-inf}" \
+    --temperature 0 \
+    --ignore-eos \
+    --disable-tqdm \
+    --save-result \
+    --result-dir "$OUT" \
+    --result-filename bench.json \
+    2>&1 | tee "$OUT/bench.log"
+  echo "BENCH_OUT=$OUT"
+}
+case "$COMMAND" in
+  serve)
+    run_server "$@"
+    ;;
+  smoke)
+    run_smoke
+    ;;
+  bench)
+    run_bench
+    ;;
+  bash|sh)
+    exec "$COMMAND" "$@"
+    ;;
+  *)
+    exec "$COMMAND" "$@"
+    ;;
+esac

manifest.json CHANGED Viewed

@@ -23,9 +23,18 @@
     "scripts/setup_repro_from_hf.sh",
     "scripts/serve_phase2_eagle.sh",
     "scripts/bench_tokens_sec_phase2_eagle.sh",
     "scripts/test_triton_codebook_match.py",
     "scripts/measure_kv_cache_compression.py"
   ],
   "recommended_runtime_env": {
     "SPECTRAL_CUDA_GRAPH": "1",
     "SPECTRAL_TRITON_COMPRESS": "1",

     "scripts/setup_repro_from_hf.sh",
     "scripts/serve_phase2_eagle.sh",
     "scripts/bench_tokens_sec_phase2_eagle.sh",
+    "scripts/build_docker_image.sh",
     "scripts/test_triton_codebook_match.py",
     "scripts/measure_kv_cache_compression.py"
   ],
+  "docker_image_build": {
+    "dockerfile": "docker/Dockerfile",
+    "entrypoint": "docker/entrypoint.sh",
+    "downloads_model_weights_at_runtime": true,
+    "includes_sidecar_by_default": true,
+    "optional_no_sidecar_build_arg": "INCLUDE_SIDECAR=0",
+    "default_image_tag": "gemmacut-spectral:008dd7f87"
+  },
   "recommended_runtime_env": {
     "SPECTRAL_CUDA_GRAPH": "1",
     "SPECTRAL_TRITON_COMPRESS": "1",

scripts/build_docker_image.sh ADDED Viewed

	@@ -0,0 +1,17 @@

+#!/usr/bin/env bash
+# Build the no-weights GemmaCut SpectralQuant runtime image.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUNDLE_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+IMAGE="${IMAGE:-gemmacut-spectral:008dd7f87}"
+docker build \
+  -f "$BUNDLE_DIR/docker/Dockerfile" \
+  -t "$IMAGE" \
+  "$@" \
+  "$BUNDLE_DIR"
+echo "Built $IMAGE"