# Workflow display name shown in the GitHub Actions UI.
name: Lambda GPU Smoke Test
# Manual trigger only; every input is optional and has a default.
on:
  workflow_dispatch:
    inputs:
      # ECR image tag to test.
      image_tag:
        description: 'ECR tag to test (e.g. latest, main, dev, auto)'
        required: false
        default: 'auto'
      # Lambda Cloud region used for the firewall ruleset and the instance.
      region:
        description: 'Lambda Cloud region (e.g. us-east-1, us-west-1)'
        required: false
        default: 'us-east-1'
      # Lambda Cloud instance type to launch.
      instance_type:
        description: 'Lambda Cloud instance type name (e.g. gpu_1x_a10, gpu_1x_h100_pcie)'
        required: false
        default: 'gpu_1x_a10'
      # Kept as a quoted string; the health step converts it with int().
      health_timeout_s:
        description: 'Seconds to wait for /health to become 200'
        required: false
        default: '2400'
      # Forwarded to the pytest step as YLFF_SMOKE_TIMEOUT_S.
      timeout_s:
        description: 'Seconds to wait for smoke jobs'
        required: false
        default: '1800'
# Workflow-wide constants, available to every step as environment variables
# and through the `env` expression context.
env:
  # Region used for the ECR calls and the AWS credential step.
  AWS_REGION: us-east-1
  # ECR repository that holds the image under test.
  ECR_REPOSITORY: ylff
  # Base URL for all Lambda Cloud API requests made below.
  LAMBDA_API_BASE: 'https://cloud.lambda.ai/api/v1'
  # Passed to the smoke tests as YLFF_SMOKE_MODEL.
  SMOKE_MODEL: 'depth-anything/DA3Metric-LARGE'
  # Host port published by the container; quoted so it stays a string.
  SERVER_PORT: '8000'
# Token scopes granted to the job.
permissions:
  # Read access for the checkout step.
  contents: read
  # Lets the job request an OIDC token; the AWS credential step assumes a
  # role via `role-to-assume` rather than using static keys.
  id-token: write
# One run per workflow + ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Provisions an ephemeral Lambda Cloud instance (SSH key + firewall ruleset
  # + instance), starts the ECR image on it, waits for /health, runs the
  # remote smoke tests against it, and always tears the resources down.
  smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 90

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install test dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest requests

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::211125621822:role/github-actions-role
          aws-region: ${{ env.AWS_REGION }}
          role-session-name: GitHubActionsSession

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # Picks the image tag. For "latest"/"auto" the immutable
      # "<branch>-<short sha>" tag is preferred when it exists in ECR;
      # otherwise "auto" falls back to "latest".
      - name: Resolve image
        id: img
        env:
          # User-supplied input goes through the environment instead of being
          # interpolated into the script text (avoids shell injection).
          INPUT_IMAGE_TAG: ${{ github.event.inputs.image_tag }}
        run: |
          set -euo pipefail
          TAG="${INPUT_IMAGE_TAG:-auto}"

          BRANCH="${GITHUB_REF_NAME}"
          SHORT_SHA="${GITHUB_SHA::7}"
          CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}"

          if [ "${TAG}" = "latest" ] || [ "${TAG}" = "auto" ]; then
            if aws ecr describe-images \
              --repository-name "${{ env.ECR_REPOSITORY }}" \
              --image-ids "imageTag=${CANDIDATE_TAG}" \
              --region "${{ env.AWS_REGION }}" >/dev/null 2>&1; then
              echo "Using immutable ECR tag: ${CANDIDATE_TAG}"
              TAG="${CANDIDATE_TAG}"
            else
              if [ "${TAG}" = "auto" ]; then
                TAG="latest"
              fi
              echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${TAG}"
            fi
          fi

          FULL_IMAGE="${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${TAG}"
          echo "image_tag=${TAG}" >> "$GITHUB_OUTPUT"
          echo "full_image=${FULL_IMAGE}" >> "$GITHUB_OUTPUT"
          echo "Using image: ${FULL_IMAGE}"

      # The remote instance has no AWS credentials, so a login password is
      # fetched here, masked in the logs, and handed to the SSH step.
      - name: Get ECR login password (for remote instance)
        id: ecrpw
        run: |
          set -euo pipefail
          PW="$(aws ecr get-login-password --region "${{ env.AWS_REGION }}")"
          if [ -z "${PW}" ]; then
            echo "Failed to obtain ECR login password"
            exit 1
          fi
          echo "::add-mask::${PW}"
          echo "ecr_password=${PW}" >> "$GITHUB_OUTPUT"

      # Generates a throwaway ed25519 key pair and registers the public half
      # with Lambda Cloud under a run-unique name.
      - name: Create ephemeral Lambda SSH key
        id: lambda-ssh
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
        run: |
          set -euo pipefail
          if [ -z "${LAMBDA_LABS_KEY:-}" ]; then
            echo "Missing secret: LAMBDA_LABS_KEY"
            exit 1
          fi

          KEY_NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
          KEY_DIR="$(mktemp -d)"
          KEY_PATH="${KEY_DIR}/id_ed25519"

          ssh-keygen -t ed25519 -N "" -f "${KEY_PATH}" >/dev/null
          PUB="$(cat "${KEY_PATH}.pub")"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/ssh-keys" \
            --header 'accept: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "$(jq -nc --arg name "${KEY_NAME}" --arg pub "${PUB}" '{name:$name, public_key:$pub}')")"

          SSH_KEY_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
          if [ -z "${SSH_KEY_ID}" ]; then
            echo "Failed to create Lambda SSH key. Response: ${RESP}"
            exit 1
          fi

          echo "ssh_key_name=${KEY_NAME}" >> "$GITHUB_OUTPUT"
          echo "ssh_key_id=${SSH_KEY_ID}" >> "$GITHUB_OUTPUT"
          echo "ssh_private_key_path=${KEY_PATH}" >> "$GITHUB_OUTPUT"

      # Opens TCP 22 (SSH) and TCP 8000 (API) to 0.0.0.0/0 for this run only.
      - name: Create ephemeral Lambda firewall ruleset (22 + 8000)
        id: lambda-fw
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          # Passed via env rather than inlined into the script (injection-safe).
          INPUT_REGION: ${{ github.event.inputs.region }}
        run: |
          set -euo pipefail
          REGION="${INPUT_REGION:-}"
          NAME="ylff-gha-fw-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"

          BODY="$(jq -nc \
            --arg name "${NAME}" \
            --arg region "${REGION}" \
            '{
              name: $name,
              region: $region,
              rules: [
                { protocol: "tcp", port_range: [22,22], source_network: "0.0.0.0/0", description: "SSH" },
                { protocol: "tcp", port_range: [8000,8000], source_network: "0.0.0.0/0", description: "YLFF API" }
              ]
            }')"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets" \
            --header 'accept: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "${BODY}")"

          FW_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
          if [ -z "${FW_ID}" ]; then
            echo "Failed to create firewall ruleset. Response: ${RESP}"
            exit 1
          fi

          echo "fw_id=${FW_ID}" >> "$GITHUB_OUTPUT"
          echo "fw_name=${NAME}" >> "$GITHUB_OUTPUT"

      # Launches one instance with the ephemeral SSH key and firewall ruleset
      # attached and records the first returned instance id.
      - name: Launch Lambda instance
        id: lambda-launch
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          # Passed via env rather than inlined into the script (injection-safe).
          INPUT_REGION: ${{ github.event.inputs.region }}
          INPUT_INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
        run: |
          set -euo pipefail
          REGION="${INPUT_REGION:-}"
          INSTANCE_TYPE="${INPUT_INSTANCE_TYPE:-}"
          SSH_KEY_NAME="${{ steps.lambda-ssh.outputs.ssh_key_name }}"
          FW_ID="${{ steps.lambda-fw.outputs.fw_id }}"

          NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"

          BODY="$(jq -nc \
            --arg region "${REGION}" \
            --arg it "${INSTANCE_TYPE}" \
            --arg name "${NAME}" \
            --arg ssh "${SSH_KEY_NAME}" \
            --arg fw "${FW_ID}" \
            '{
              region_name: $region,
              instance_type_name: $it,
              ssh_key_names: [$ssh],
              file_system_names: [],
              name: $name,
              firewall_rulesets: [{id: $fw}]
            }')"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/instance-operations/launch" \
            --header 'accept: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "${BODY}")"

          INSTANCE_ID="$(echo "${RESP}" | jq -r '.data.instance_ids[0] // empty')"
          if [ -z "${INSTANCE_ID}" ]; then
            echo "Failed to launch instance. Response: ${RESP}"
            exit 1
          fi

          echo "instance_id=${INSTANCE_ID}" >> "$GITHUB_OUTPUT"

      # Polls the instance endpoint for up to 20 minutes until the status is
      # "active" and an IP is reported, then exports it as `instance_ip`.
      - name: Wait for Lambda instance to become active + get IP
        id: lambda-wait
        env:
          LAMBDA_API_BASE: ${{ env.LAMBDA_API_BASE }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
        run: |
          set -euo pipefail

          python - <<'PY'
          import os
          import time
          import requests

          base = os.environ["LAMBDA_API_BASE"].rstrip("/")
          instance_id = os.environ["INSTANCE_ID"]
          api_key = os.environ["LAMBDA_LABS_KEY"]

          url = f"{base}/instances/{instance_id}"
          deadline = time.time() + 20 * 60

          ip = None
          last = None
          while time.time() < deadline:
              # A per-request timeout keeps one stalled connection from
              # blocking past the deadline; transport errors are retried
              # like HTTP errors instead of aborting the step.
              try:
                  r = requests.get(
                      url,
                      headers={"accept": "application/json"},
                      auth=(api_key, ""),
                      timeout=30,
                  )
              except requests.RequestException as e:
                  last = ("error", repr(e))
                  time.sleep(2.0)
                  continue
              if r.status_code >= 400:
                  last = (r.status_code, r.text[:500])
                  time.sleep(2.0)
                  continue
              try:
                  data = (r.json() or {}).get("data") or {}
              except ValueError as e:
                  # Non-JSON body on a success status: record it and retry.
                  last = ("bad-json", repr(e))
                  time.sleep(2.0)
                  continue
              status = data.get("status")
              ip = data.get("ip")
              last = {"status": status, "ip": ip}
              if status == "active" and ip:
                  print(ip)
                  break
              time.sleep(3.0)
          else:
              # Reached only when the deadline passes without a `break`.
              raise SystemExit(f"Timed out waiting for instance to become active. last={last!r}")

          out = os.environ["GITHUB_OUTPUT"]
          with open(out, "a", encoding="utf-8") as f:
              f.write(f"instance_ip={ip}\n")
          PY

      # Waits for sshd, then runs a bootstrap script on the instance that logs
      # in to ECR, pulls the image, and starts the API container.
      - name: SSH bootstrap + run container
        id: lambda-remote
        env:
          INSTANCE_IP: ${{ steps.lambda-wait.outputs.instance_ip }}
          KEY_PATH: ${{ steps.lambda-ssh.outputs.ssh_private_key_path }}
          FULL_IMAGE: ${{ steps.img.outputs.full_image }}
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_PASSWORD: ${{ steps.ecrpw.outputs.ecr_password }}
          SERVER_PORT: ${{ env.SERVER_PORT }}
        run: |
          set -euo pipefail

          # Retry SSH for up to 60 attempts (5s connect timeout + 5s sleep).
          SSH_READY=0
          for i in {1..60}; do
            if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
              -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" "echo ok" >/dev/null 2>&1; then
              SSH_READY=1
              break
            fi
            sleep 5
          done
          # Fail with a clear message instead of falling through to the
          # bootstrap command when SSH never became reachable.
          if [ "${SSH_READY}" -ne 1 ]; then
            echo "Timed out waiting for SSH on ${INSTANCE_IP}"
            exit 1
          fi

          # The quoted heredoc is not expanded locally; printf %q escapes it
          # into a single argument for the remote `bash -lc`. The variables
          # it references are supplied by the assignment prefix on the
          # remote command line.
          ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" \
            "ECR_PASSWORD='${ECR_PASSWORD}' ECR_REGISTRY='${ECR_REGISTRY}' FULL_IMAGE='${FULL_IMAGE}' SERVER_PORT='${SERVER_PORT}' bash -lc $(printf %q "$(cat <<'BASH'
          set -euo pipefail

          echo "Checking docker..."
          if ! command -v docker >/dev/null 2>&1; then
            echo "docker not found; installing"
            sudo apt-get update -y
            sudo apt-get install -y docker.io
          fi
          sudo systemctl enable --now docker || true

          echo "${ECR_PASSWORD}" | sudo docker login --username AWS --password-stdin "${ECR_REGISTRY}"

          sudo docker pull "${FULL_IMAGE}"
          sudo docker rm -f ylff || true

          sudo mkdir -p /workspace/.cache

          sudo docker run -d --restart=unless-stopped \
            --gpus all \
            --name ylff \
            -p ${SERVER_PORT}:8000 \
            -v /workspace:/workspace \
            -e PYTHONUNBUFFERED=1 \
            -e PYTHONPATH=/app \
            -e XDG_CACHE_HOME=/workspace/.cache \
            -e HF_HOME=/workspace/.cache/huggingface \
            -e HUGGINGFACE_HUB_CACHE=/workspace/.cache/huggingface/hub \
            -e TRANSFORMERS_CACHE=/workspace/.cache/huggingface/transformers \
            -e TORCH_HOME=/workspace/.cache/torch \
            "${FULL_IMAGE}" \
            python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000 --log-level info --access-log

          echo "Container started. Recent logs:"
          sudo docker logs --tail 50 ylff || true
          BASH
          )")"

      # Polls GET /health every 5 seconds until it returns 200 or the
      # configured timeout elapses.
      - name: Wait for API health
        env:
          BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          HEALTH_TIMEOUT_S: ${{ github.event.inputs.health_timeout_s }}
        run: |
          set -e
          python - <<'PY'
          import os
          import time
          import requests
          from urllib.parse import urljoin

          base = os.environ["BASE_URL"].rstrip("/") + "/"
          timeout_s = int((os.environ.get("HEALTH_TIMEOUT_S") or "2400").strip())
          url = urljoin(base, "health")

          start = time.time()
          last = None
          print(f"Polling {url} (timeout={timeout_s}s) ...", flush=True)
          while True:
              elapsed = int(time.time() - start)
              try:
                  r = requests.get(url, timeout=10)
                  last = (r.status_code, (r.text or "")[:300])
                  if r.status_code == 200:
                      print("API is healthy.", flush=True)
                      # SystemExit is not an Exception subclass, so the
                      # handler below does not swallow it.
                      raise SystemExit(0)
              except Exception as e:
                  last = ("error", repr(e))
              if elapsed >= timeout_s:
                  break
              time.sleep(5)
          raise SystemExit(f"Timed out waiting for /health. last={last!r}")
          PY

      # Runs the remote smoke tests against the instance's public URL.
      - name: Run remote smoke pytest
        env:
          RUNPOD_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          YLFF_SMOKE_DEVICE: "cuda"
          YLFF_SMOKE_MODEL: ${{ env.SMOKE_MODEL }}
          YLFF_SMOKE_TIMEOUT_S: ${{ github.event.inputs.timeout_s }}

          YLFF_EXPECT_GPU_SUBSTR: ""
          YLFF_RUN_INFERENCE_PIPELINE_SMOKE: "1"
          YLFF_SMOKE_PIPELINE_SAMPLE: "arkitscenes_40753679_clip"
        run: |
          pytest -q \
            tests/test_remote_runpod_smoke.py \
            tests/test_remote_runpod_train_smoke.py

      # Writes run details to the job summary whether or not earlier steps
      # succeeded.
      - name: Lambda smoke summary
        if: always()
        env:
          BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          FULL_IMAGE: ${{ steps.img.outputs.full_image }}
          REGION: ${{ github.event.inputs.region }}
          INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
        run: |
          {
            echo "## Lambda GPU Smoke Summary"
            echo ""
            echo "- **Instance ID**: \`${INSTANCE_ID}\`"
            echo "- **Region**: \`${REGION}\`"
            echo "- **Instance type**: \`${INSTANCE_TYPE}\`"
            echo "- **Base URL**: ${BASE_URL}"
            echo "- **Docker image**: \`${FULL_IMAGE}\`"
            echo ""
            echo "- **Lambda Cloud API docs**: https://docs-api.lambda.ai/api/cloud"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      # Best-effort teardown: each call is skipped when its id is empty and
      # its failure is ignored so the remaining resources are still attempted.
      - name: Cleanup (terminate instance + delete firewall ruleset + delete SSH key)
        if: always()
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
          FW_ID: ${{ steps.lambda-fw.outputs.fw_id }}
          SSH_KEY_ID: ${{ steps.lambda-ssh.outputs.ssh_key_id }}
        run: |
          set +euo pipefail

          if [ -n "${INSTANCE_ID}" ]; then
            curl -sS --fail \
              --request POST \
              --url "${{ env.LAMBDA_API_BASE }}/instance-operations/terminate" \
              --header 'accept: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              --data "$(jq -nc --arg id "${INSTANCE_ID}" '{instance_ids: [$id]}')" \
              || true
          fi

          if [ -n "${FW_ID}" ]; then
            curl -sS --fail \
              --request DELETE \
              --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets/${FW_ID}" \
              --header 'accept: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              || true
          fi

          if [ -n "${SSH_KEY_ID}" ]; then
            curl -sS --fail \
              --request DELETE \
              --url "${{ env.LAMBDA_API_BASE }}/ssh-keys/${SSH_KEY_ID}" \
              --header 'accept: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              || true
          fi
|