# 3d_model/.github/workflows/lambda-gpu-smoke.yml
# Azan
# Clean deployment build (Squashed)
# 7a87926
# Manually triggered smoke test: launches a Lambda Cloud GPU instance, runs the
# ECR image on it, and points the remote smoke tests at the resulting API.
name: Lambda GPU Smoke Test

on:
  workflow_dispatch:
    inputs:
      # "auto" and "latest" first try the "<branch>-<short sha>" tag in ECR
      # (see the "Resolve image" step).
      image_tag:
        description: 'ECR tag to test (e.g. latest, main, dev, auto)'
        required: false
        default: 'auto'
      region:
        description: 'Lambda Cloud region (e.g. us-east-1, us-west-1)'
        required: false
        default: 'us-east-1'
      instance_type:
        description: 'Lambda Cloud instance type name (e.g. gpu_1x_a10, gpu_1x_h100_pcie)'
        required: false
        default: 'gpu_1x_a10'
      # Numeric inputs are quoted so they stay strings.
      health_timeout_s:
        description: 'Seconds to wait for /health to become 200'
        required: false
        default: '2400'
      timeout_s:
        description: 'Seconds to wait for smoke jobs'
        required: false
        default: '1800'
# Workflow-wide constants; exported to every step as environment variables.
env:
  AWS_REGION: us-east-1
  ECR_REPOSITORY: ylff
  LAMBDA_API_BASE: 'https://cloud.lambda.ai/api/v1'
  SMOKE_MODEL: 'depth-anything/DA3Metric-LARGE'
  # Quoted so the port is passed around as a string, not an integer.
  SERVER_PORT: '8000'
# Token permissions: read the repository and request an OIDC token
# (the AWS credentials step assumes a role).
permissions:
  contents: read
  id-token: write

# One run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Single job: provision, deploy, test, and tear down the GPU instance.
  smoke:
    runs-on: ubuntu-latest
    # Hard cap for the whole job, in minutes.
    timeout-minutes: 90
    steps:
# Fetch the repository, including Git LFS objects.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    lfs: true
# Python for the polling scripts and pytest run on the runner.
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    # Quoted: an unquoted version such as 3.10 would be parsed as a float.
    python-version: '3.11'
# Install the project requirements plus the packages the later steps use
# directly (pytest for the smoke tests, requests for the polling scripts).
- name: Install test dependencies
  run: |
    python -m pip install --upgrade pip
    pip install -r requirements.txt
    pip install pytest requests
# Assume the GitHub Actions IAM role for the ECR calls that follow.
- name: Configure AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    role-to-assume: 'arn:aws:iam::211125621822:role/github-actions-role'
    aws-region: ${{ env.AWS_REGION }}
    role-session-name: GitHubActionsSession
# Log in to ECR; later steps read the `registry` output of this step.
- name: Login to Amazon ECR
  id: login-ecr
  uses: aws-actions/amazon-ecr-login@v2
# Decide which ECR tag to deploy and publish it as step outputs
# (`image_tag`, `full_image`).
#
# For "auto" and "latest", the "<branch>-<short sha>" tag is preferred when it
# exists in ECR; otherwise "auto" falls back to "latest". Any other requested
# tag is used verbatim.
- name: Resolve image
  id: img
  env:
    # The user-supplied input is passed through the environment instead of
    # being interpolated into the script text, so its content can never be
    # executed as shell code.
    REQUESTED_TAG: ${{ github.event.inputs.image_tag }}
  run: |
    set -euo pipefail
    # An empty input behaves like "auto".
    TAG="${REQUESTED_TAG:-auto}"
    BRANCH="${GITHUB_REF_NAME}"
    SHORT_SHA="${GITHUB_SHA::7}"
    CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}"
    if [ "${TAG}" = "latest" ] || [ "${TAG}" = "auto" ]; then
      if aws ecr describe-images \
        --repository-name "${{ env.ECR_REPOSITORY }}" \
        --image-ids "imageTag=${CANDIDATE_TAG}" \
        --region "${{ env.AWS_REGION }}" >/dev/null 2>&1; then
        echo "Using immutable ECR tag: ${CANDIDATE_TAG}"
        TAG="${CANDIDATE_TAG}"
      else
        if [ "${TAG}" = "auto" ]; then
          TAG="latest"
        fi
        echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${TAG}"
      fi
    fi
    FULL_IMAGE="${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${TAG}"
    echo "image_tag=${TAG}" >> "$GITHUB_OUTPUT"
    echo "full_image=${FULL_IMAGE}" >> "$GITHUB_OUTPUT"
    echo "Using image: ${FULL_IMAGE}"
# Obtain an ECR login password on the runner so it can be handed to the remote
# instance. The value is masked in logs before it is written as a step output.
- name: Get ECR login password (for remote instance)
  id: ecrpw
  run: |
    set -euo pipefail
    ecr_token="$(aws ecr get-login-password --region "${{ env.AWS_REGION }}")"
    [ -n "${ecr_token}" ] || { echo "Failed to obtain ECR login password"; exit 1; }
    echo "::add-mask::${ecr_token}"
    echo "ecr_password=${ecr_token}" >> "$GITHUB_OUTPUT"
# Generate a throwaway ed25519 key pair on the runner and register the public
# half with the Lambda API under a run-specific name.
# Outputs: ssh_key_name, ssh_key_id, ssh_private_key_path.
- name: Create ephemeral Lambda SSH key
  id: lambda-ssh
  env:
    LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
  run: |
    set -euo pipefail
    if [ -z "${LAMBDA_LABS_KEY:-}" ]; then
      echo "Missing secret: LAMBDA_LABS_KEY"
      exit 1
    fi
    # Run id + attempt keeps the name unique across re-runs.
    KEY_NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    KEY_DIR="$(mktemp -d)"
    KEY_PATH="${KEY_DIR}/id_ed25519"
    ssh-keygen -t ed25519 -N "" -f "${KEY_PATH}" >/dev/null
    PUB="$(cat "${KEY_PATH}.pub")"
    BODY="$(jq -nc --arg name "${KEY_NAME}" --arg pub "${PUB}" '{name:$name, public_key:$pub}')"
    # --fail-with-body keeps the error payload, and wrapping the call in `if !`
    # stops `set -e` from aborting before the payload is printed.
    if ! RESP="$(curl -sS --fail-with-body \
      --request POST \
      --url "${{ env.LAMBDA_API_BASE }}/ssh-keys" \
      --header 'accept: application/json' \
      --header 'content-type: application/json' \
      --user "${LAMBDA_LABS_KEY}:" \
      --data "${BODY}")"; then
      echo "Failed to create Lambda SSH key. Response: ${RESP}"
      exit 1
    fi
    SSH_KEY_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
    if [ -z "${SSH_KEY_ID}" ]; then
      echo "Failed to create Lambda SSH key. Response: ${RESP}"
      exit 1
    fi
    echo "ssh_key_name=${KEY_NAME}" >> "$GITHUB_OUTPUT"
    echo "ssh_key_id=${SSH_KEY_ID}" >> "$GITHUB_OUTPUT"
    echo "ssh_private_key_path=${KEY_PATH}" >> "$GITHUB_OUTPUT"
# Create a run-specific firewall ruleset that opens SSH (22) and the API port
# to any source address. Outputs: fw_id, fw_name.
- name: Create ephemeral Lambda firewall ruleset (22 + 8000)
  id: lambda-fw
  env:
    LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
    # User input goes through the environment, never into the script text.
    REGION: ${{ github.event.inputs.region }}
  run: |
    set -euo pipefail
    NAME="ylff-gha-fw-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    # The API rule follows SERVER_PORT (workflow env), which is the host port
    # the container is published on and the port the health check targets.
    BODY="$(jq -nc \
      --arg name "${NAME}" \
      --arg region "${REGION}" \
      --argjson api_port "${SERVER_PORT}" \
      '{
        name: $name,
        region: $region,
        rules: [
          { protocol: "tcp", port_range: [22,22], source_network: "0.0.0.0/0", description: "SSH" },
          { protocol: "tcp", port_range: [$api_port,$api_port], source_network: "0.0.0.0/0", description: "YLFF API" }
        ]
      }')"
    # --fail-with-body + `if !` so the API's error payload is actually printed.
    if ! RESP="$(curl -sS --fail-with-body \
      --request POST \
      --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets" \
      --header 'accept: application/json' \
      --header 'content-type: application/json' \
      --user "${LAMBDA_LABS_KEY}:" \
      --data "${BODY}")"; then
      echo "Failed to create firewall ruleset. Response: ${RESP}"
      exit 1
    fi
    FW_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
    if [ -z "${FW_ID}" ]; then
      echo "Failed to create firewall ruleset. Response: ${RESP}"
      exit 1
    fi
    echo "fw_id=${FW_ID}" >> "$GITHUB_OUTPUT"
    echo "fw_name=${NAME}" >> "$GITHUB_OUTPUT"
# Launch one instance with the ephemeral SSH key and firewall ruleset attached.
# Output: instance_id (first id returned by the API).
- name: Launch Lambda instance
  id: lambda-launch
  env:
    LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
    # Inputs and step outputs are passed as environment variables rather than
    # interpolated into the script, so their content cannot be run as shell.
    REGION: ${{ github.event.inputs.region }}
    INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
    SSH_KEY_NAME: ${{ steps.lambda-ssh.outputs.ssh_key_name }}
    FW_ID: ${{ steps.lambda-fw.outputs.fw_id }}
  run: |
    set -euo pipefail
    NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
    BODY="$(jq -nc \
      --arg region "${REGION}" \
      --arg it "${INSTANCE_TYPE}" \
      --arg name "${NAME}" \
      --arg ssh "${SSH_KEY_NAME}" \
      --arg fw "${FW_ID}" \
      '{
        region_name: $region,
        instance_type_name: $it,
        ssh_key_names: [$ssh],
        file_system_names: [],
        name: $name,
        firewall_rulesets: [{id: $fw}]
      }')"
    # --fail-with-body + `if !` so the API's error payload (e.g. a capacity
    # or validation error) is printed instead of being lost to `set -e`.
    if ! RESP="$(curl -sS --fail-with-body \
      --request POST \
      --url "${{ env.LAMBDA_API_BASE }}/instance-operations/launch" \
      --header 'accept: application/json' \
      --header 'content-type: application/json' \
      --user "${LAMBDA_LABS_KEY}:" \
      --data "${BODY}")"; then
      echo "Failed to launch instance. Response: ${RESP}"
      exit 1
    fi
    INSTANCE_ID="$(echo "${RESP}" | jq -r '.data.instance_ids[0] // empty')"
    if [ -z "${INSTANCE_ID}" ]; then
      echo "Failed to launch instance. Response: ${RESP}"
      exit 1
    fi
    echo "instance_id=${INSTANCE_ID}" >> "$GITHUB_OUTPUT"
# Poll the Lambda API (up to 20 minutes) until the instance reports status
# "active" and has an IP. Output: instance_ip.
- name: Wait for Lambda instance to become active + get IP
  id: lambda-wait
  env:
    LAMBDA_API_BASE: ${{ env.LAMBDA_API_BASE }}
    INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
    LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
  run: |
    set -euo pipefail
    python - <<'PY'
    import os
    import time
    import requests

    base = os.environ["LAMBDA_API_BASE"].rstrip("/")
    instance_id = os.environ["INSTANCE_ID"]
    api_key = os.environ["LAMBDA_LABS_KEY"]
    url = f"{base}/instances/{instance_id}"

    POLL_INTERVAL_S = 3.0  # API is rate-limited; keep this gentle.
    REQUEST_TIMEOUT_S = 30  # Without a timeout a stalled connection blocks forever.

    deadline = time.time() + 20 * 60
    ip = None
    last = None
    while time.time() < deadline:
        try:
            r = requests.get(
                url,
                headers={"accept": "application/json"},
                auth=(api_key, ""),
                timeout=REQUEST_TIMEOUT_S,
            )
            if r.status_code >= 400:
                last = (r.status_code, r.text[:500])
                time.sleep(POLL_INTERVAL_S)
                continue
            data = (r.json() or {}).get("data") or {}
        except (requests.RequestException, ValueError) as e:
            # Transient network failure or non-JSON body: retry until the deadline.
            last = ("error", repr(e))
            time.sleep(POLL_INTERVAL_S)
            continue
        status = data.get("status")
        ip = data.get("ip")
        last = {"status": status, "ip": ip}
        if status == "active" and ip:
            print(ip)
            break
        time.sleep(POLL_INTERVAL_S)
    else:
        # Reached only when the loop ran out of time without a `break`.
        raise SystemExit(f"Timed out waiting for instance to become active. last={last!r}")

    out = os.environ["GITHUB_OUTPUT"]
    with open(out, "a", encoding="utf-8") as f:
        f.write(f"instance_ip={ip}\n")
    PY
# Wait for SSH on the new instance, then run a bootstrap script over SSH that
# ensures docker is present, logs in to ECR, pulls the image and starts the
# API container named "ylff".
- name: SSH bootstrap + run container
  id: lambda-remote
  env:
    INSTANCE_IP: ${{ steps.lambda-wait.outputs.instance_ip }}
    KEY_PATH: ${{ steps.lambda-ssh.outputs.ssh_private_key_path }}
    FULL_IMAGE: ${{ steps.img.outputs.full_image }}
    ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
    ECR_PASSWORD: ${{ steps.ecrpw.outputs.ecr_password }}
    SERVER_PORT: ${{ env.SERVER_PORT }}
  run: |
    set -euo pipefail
    # Wait for SSH to accept connections (60 attempts, 5 s apart).
    ssh_ready=0
    for i in {1..60}; do
      if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
        -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" "echo ok" >/dev/null 2>&1; then
        ssh_ready=1
        break
      fi
      sleep 5
    done
    # Fail with a clear message instead of falling through to the bootstrap
    # command when SSH never came up.
    if [ "${ssh_ready}" -ne 1 ]; then
      echo "Timed out waiting for SSH on ${INSTANCE_IP}"
      exit 1
    fi
    # Run remote bootstrap + start API
    #
    # NOTE: We pass ECR credentials and image as inline env vars for the remote shell
    # (Lambda instance won't have these set).
    # The quoted heredoc is sent verbatim; printf %q turns it into a single
    # shell word for the remote `bash -lc`.
    ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
      -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" \
      "ECR_PASSWORD='${ECR_PASSWORD}' ECR_REGISTRY='${ECR_REGISTRY}' FULL_IMAGE='${FULL_IMAGE}' SERVER_PORT='${SERVER_PORT}' bash -lc $(printf %q "$(cat <<'BASH'
    set -euo pipefail
    echo "Checking docker..."
    if ! command -v docker >/dev/null 2>&1; then
      echo "docker not found; installing"
      sudo apt-get update -y
      sudo apt-get install -y docker.io
    fi
    sudo systemctl enable --now docker || true
    # ECR login (runner provides short-lived password)
    echo "${ECR_PASSWORD}" | sudo docker login --username AWS --password-stdin "${ECR_REGISTRY}"
    # Pull and run image (explicit uvicorn command for consistency with RunPod template)
    sudo docker pull "${FULL_IMAGE}"
    sudo docker rm -f ylff || true
    # Provide a stable cache volume similar to RunPod's /workspace.
    sudo mkdir -p /workspace/.cache
    sudo docker run -d --restart=unless-stopped \
      --gpus all \
      --name ylff \
      -p "${SERVER_PORT}:8000" \
      -v /workspace:/workspace \
      -e PYTHONUNBUFFERED=1 \
      -e PYTHONPATH=/app \
      -e XDG_CACHE_HOME=/workspace/.cache \
      -e HF_HOME=/workspace/.cache/huggingface \
      -e HUGGINGFACE_HUB_CACHE=/workspace/.cache/huggingface/hub \
      -e TRANSFORMERS_CACHE=/workspace/.cache/huggingface/transformers \
      -e TORCH_HOME=/workspace/.cache/torch \
      "${FULL_IMAGE}" \
      python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000 --log-level info --access-log
    echo "Container started. Recent logs:"
    sudo docker logs --tail 50 ylff || true
    BASH
    )")"
# Poll GET <BASE_URL>/health every 5 s until it returns 200 or the timeout
# elapses; exits non-zero with the last observation on timeout.
- name: Wait for API health
  env:
    BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
    HEALTH_TIMEOUT_S: ${{ github.event.inputs.health_timeout_s }}
  run: |
    set -e
    python - <<'PY'
    import os
    import time
    import requests
    from urllib.parse import urljoin

    root = os.environ["BASE_URL"].rstrip("/") + "/"
    timeout_s = int((os.environ.get("HEALTH_TIMEOUT_S") or "2400").strip())
    url = urljoin(root, "health")


    def probe():
        """Return (status_code, body prefix), or ("error", repr) on any exception."""
        try:
            resp = requests.get(url, timeout=10)
            return (resp.status_code, (resp.text or "")[:300])
        except Exception as exc:
            return ("error", repr(exc))


    started = time.time()
    last = None
    print(f"Polling {url} (timeout={timeout_s}s) ...", flush=True)
    while True:
        # Elapsed time is sampled before the request, so one final probe is
        # always made once the timeout has been crossed.
        elapsed = int(time.time() - started)
        last = probe()
        if last[0] == 200:
            print("API is healthy.", flush=True)
            raise SystemExit(0)
        if elapsed >= timeout_s:
            break
        time.sleep(5)
    raise SystemExit(f"Timed out waiting for /health. last={last!r}")
    PY
# Run the remote smoke tests from the runner against the instance's API.
# The test modules are configured entirely through these environment variables.
- name: Run remote smoke pytest
  env:
    RUNPOD_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
    YLFF_SMOKE_DEVICE: 'cuda'
    YLFF_SMOKE_MODEL: ${{ env.SMOKE_MODEL }}
    YLFF_SMOKE_TIMEOUT_S: ${{ github.event.inputs.timeout_s }}
    # Lambda GPU names vary by region/capacity, so no GPU-name substring is
    # asserted by default.
    YLFF_EXPECT_GPU_SUBSTR: ''
    YLFF_RUN_INFERENCE_PIPELINE_SMOKE: '1'
    YLFF_SMOKE_PIPELINE_SAMPLE: 'arkitscenes_40753679_clip'
  run: |
    pytest -q tests/test_remote_runpod_smoke.py tests/test_remote_runpod_train_smoke.py
# Append a Markdown summary of the run to the job summary page. Runs even when
# earlier steps failed, so some values may be empty.
- name: Lambda smoke summary
  if: always()
  env:
    BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
    FULL_IMAGE: ${{ steps.img.outputs.full_image }}
    REGION: ${{ github.event.inputs.region }}
    INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
    INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
  run: |
    cat >> "$GITHUB_STEP_SUMMARY" <<EOF
    ## Lambda GPU Smoke Summary

    - **Instance ID**: \`${INSTANCE_ID}\`
    - **Region**: \`${REGION}\`
    - **Instance type**: \`${INSTANCE_TYPE}\`
    - **Base URL**: ${BASE_URL}
    - **Docker image**: \`${FULL_IMAGE}\`

    - **Lambda Cloud API docs**: https://docs-api.lambda.ai/api/cloud

    EOF
# Best-effort teardown of everything this run created. Runs even when earlier
# steps failed or the run was cancelled; each resource is skipped when its id
# was never produced. Failures never fail the job, but each one raises a
# warning annotation so a leftover resource does not go unnoticed.
- name: Cleanup (terminate instance + delete firewall ruleset + delete SSH key)
  if: always()
  env:
    LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
    INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
    FW_ID: ${{ steps.lambda-fw.outputs.fw_id }}
    SSH_KEY_ID: ${{ steps.lambda-ssh.outputs.ssh_key_id }}
  run: |
    # Deliberately disable strict mode: every cleanup action must be attempted
    # regardless of whether the previous one succeeded.
    set +euo pipefail
    if [ -n "${INSTANCE_ID}" ]; then
      curl -sS --fail \
        --request POST \
        --url "${{ env.LAMBDA_API_BASE }}/instance-operations/terminate" \
        --header 'accept: application/json' \
        --header 'content-type: application/json' \
        --user "${LAMBDA_LABS_KEY}:" \
        --data "$(jq -nc --arg id "${INSTANCE_ID}" '{instance_ids: [$id]}')" \
        || echo "::warning::Failed to terminate Lambda instance ${INSTANCE_ID}; verify it is not still running."
    fi
    if [ -n "${FW_ID}" ]; then
      curl -sS --fail \
        --request DELETE \
        --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets/${FW_ID}" \
        --header 'accept: application/json' \
        --user "${LAMBDA_LABS_KEY}:" \
        || echo "::warning::Failed to delete Lambda firewall ruleset ${FW_ID}."
    fi
    if [ -n "${SSH_KEY_ID}" ]; then
      curl -sS --fail \
        --request DELETE \
        --url "${{ env.LAMBDA_API_BASE }}/ssh-keys/${SSH_KEY_ID}" \
        --header 'accept: application/json' \
        --user "${LAMBDA_LABS_KEY}:" \
        || echo "::warning::Failed to delete Lambda SSH key ${SSH_KEY_ID}."
    fi