# Manually-triggered GPU smoke test.
#
# Flow: resolve an ECR image tag -> create an ephemeral SSH key and firewall
# ruleset on Lambda Cloud -> launch a GPU instance -> start the API container
# over SSH -> wait for /health -> run the remote smoke tests -> always tear
# everything down again.
#
# Security note: workflow_dispatch inputs are never interpolated directly into
# `run:` scripts; they are passed through `env:` and read as shell variables so
# that their contents cannot be interpreted as shell syntax.
name: Lambda GPU Smoke Test

on:
  workflow_dispatch:
    inputs:
      image_tag:
        description: "ECR tag to test (e.g. latest, main, dev, auto)"
        required: false
        default: "auto"
      region:
        description: "Lambda Cloud region (e.g. us-east-1, us-west-1)"
        required: false
        default: "us-east-1"
      instance_type:
        description: "Lambda Cloud instance type name (e.g. gpu_1x_a10, gpu_1x_h100_pcie)"
        required: false
        default: "gpu_1x_a10"
      health_timeout_s:
        description: "Seconds to wait for /health to become 200"
        required: false
        default: "2400"
      timeout_s:
        description: "Seconds to wait for smoke jobs"
        required: false
        default: "1800"

env:
  AWS_REGION: us-east-1
  ECR_REPOSITORY: ylff
  LAMBDA_API_BASE: https://cloud.lambda.ai/api/v1
  SMOKE_MODEL: "depth-anything/DA3Metric-LARGE"
  SERVER_PORT: "8000"

permissions:
  contents: read
  # Required for OIDC role assumption in "Configure AWS credentials".
  id-token: write

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 90
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install test dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pytest requests

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::211125621822:role/github-actions-role
          aws-region: ${{ env.AWS_REGION }}
          role-session-name: GitHubActionsSession

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # Picks the image tag to test. For "latest"/"auto" it prefers the
      # immutable "<branch>-<short sha>" tag when that exists in ECR; otherwise
      # "auto" falls back to "latest". Any other tag is used as given.
      - name: Resolve image
        id: img
        env:
          # Passed via env (not inline ${{ }}) to avoid shell injection.
          IMAGE_TAG_INPUT: ${{ github.event.inputs.image_tag }}
        run: |
          set -euo pipefail
          TAG="${IMAGE_TAG_INPUT:-}"
          if [ -z "${TAG}" ]; then
            TAG="auto"
          fi

          BRANCH="${GITHUB_REF_NAME}"
          SHORT_SHA="${GITHUB_SHA::7}"
          CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}"

          if [ "${TAG}" = "latest" ] || [ "${TAG}" = "auto" ]; then
            if aws ecr describe-images \
              --repository-name "${{ env.ECR_REPOSITORY }}" \
              --image-ids "imageTag=${CANDIDATE_TAG}" \
              --region "${{ env.AWS_REGION }}" \
              >/dev/null 2>&1; then
              echo "Using immutable ECR tag: ${CANDIDATE_TAG}"
              TAG="${CANDIDATE_TAG}"
            else
              if [ "${TAG}" = "auto" ]; then
                TAG="latest"
              fi
              echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${TAG}"
            fi
          fi

          # The tag ends up inside a quoted remote shell command later on, so
          # reject anything that is not a syntactically valid Docker tag.
          TAG_RE='^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'
          if ! [[ "${TAG}" =~ ${TAG_RE} ]]; then
            echo "Invalid image tag: ${TAG}"
            exit 1
          fi

          FULL_IMAGE="${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${TAG}"
          echo "image_tag=${TAG}" >> "$GITHUB_OUTPUT"
          echo "full_image=${FULL_IMAGE}" >> "$GITHUB_OUTPUT"
          echo "Using image: ${FULL_IMAGE}"

      # The remote instance has no AWS credentials, so the runner fetches a
      # short-lived registry password and hands it over in a later step.
      - name: Get ECR login password (for remote instance)
        id: ecrpw
        run: |
          set -euo pipefail
          PW="$(aws ecr get-login-password --region "${{ env.AWS_REGION }}")"
          if [ -z "${PW}" ]; then
            echo "Failed to obtain ECR login password"
            exit 1
          fi
          # Mask before writing the value anywhere it could be echoed to logs.
          echo "::add-mask::${PW}"
          echo "ecr_password=${PW}" >> "$GITHUB_OUTPUT"

      # Generates a throwaway ed25519 key pair and registers the public half
      # with Lambda Cloud under a run-unique name.
      - name: Create ephemeral Lambda SSH key
        id: lambda-ssh
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
        run: |
          set -euo pipefail
          if [ -z "${LAMBDA_LABS_KEY:-}" ]; then
            echo "Missing secret: LAMBDA_LABS_KEY"
            exit 1
          fi

          KEY_NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
          KEY_DIR="$(mktemp -d)"
          KEY_PATH="${KEY_DIR}/id_ed25519"
          ssh-keygen -t ed25519 -N "" -f "${KEY_PATH}" >/dev/null
          PUB="$(cat "${KEY_PATH}.pub")"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/ssh-keys" \
            --header 'accept: application/json' \
            --header 'content-type: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "$(jq -nc --arg name "${KEY_NAME}" --arg pub "${PUB}" '{name:$name, public_key:$pub}')")"

          SSH_KEY_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
          if [ -z "${SSH_KEY_ID}" ]; then
            echo "Failed to create Lambda SSH key. Response: ${RESP}"
            exit 1
          fi

          echo "ssh_key_name=${KEY_NAME}" >> "$GITHUB_OUTPUT"
          echo "ssh_key_id=${SSH_KEY_ID}" >> "$GITHUB_OUTPUT"
          echo "ssh_private_key_path=${KEY_PATH}" >> "$GITHUB_OUTPUT"

      # Opens TCP 22 (SSH) and TCP 8000 (API) to the world for this run only;
      # the ruleset is deleted again in the cleanup step.
      - name: Create ephemeral Lambda firewall ruleset (22 + 8000)
        id: lambda-fw
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          # Passed via env (not inline ${{ }}) to avoid shell injection.
          REGION: ${{ github.event.inputs.region }}
        run: |
          set -euo pipefail
          NAME="ylff-gha-fw-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"

          BODY="$(jq -nc \
            --arg name "${NAME}" \
            --arg region "${REGION}" \
            '{
              name: $name,
              region: $region,
              rules: [
                { protocol: "tcp", port_range: [22,22], source_network: "0.0.0.0/0", description: "SSH" },
                { protocol: "tcp", port_range: [8000,8000], source_network: "0.0.0.0/0", description: "YLFF API" }
              ]
            }')"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets" \
            --header 'accept: application/json' \
            --header 'content-type: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "${BODY}")"

          FW_ID="$(echo "${RESP}" | jq -r '.data.id // empty')"
          if [ -z "${FW_ID}" ]; then
            echo "Failed to create firewall ruleset. Response: ${RESP}"
            exit 1
          fi

          echo "fw_id=${FW_ID}" >> "$GITHUB_OUTPUT"
          echo "fw_name=${NAME}" >> "$GITHUB_OUTPUT"

      - name: Launch Lambda instance
        id: lambda-launch
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          # Passed via env (not inline ${{ }}) to avoid shell injection.
          REGION: ${{ github.event.inputs.region }}
          INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
        run: |
          set -euo pipefail
          # These two come from earlier steps of this workflow, not from users.
          SSH_KEY_NAME="${{ steps.lambda-ssh.outputs.ssh_key_name }}"
          FW_ID="${{ steps.lambda-fw.outputs.fw_id }}"
          NAME="ylff-gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"

          BODY="$(jq -nc \
            --arg region "${REGION}" \
            --arg it "${INSTANCE_TYPE}" \
            --arg name "${NAME}" \
            --arg ssh "${SSH_KEY_NAME}" \
            --arg fw "${FW_ID}" \
            '{
              region_name: $region,
              instance_type_name: $it,
              ssh_key_names: [$ssh],
              file_system_names: [],
              name: $name,
              firewall_rulesets: [{id: $fw}]
            }')"

          RESP="$(curl -sS --fail \
            --request POST \
            --url "${{ env.LAMBDA_API_BASE }}/instance-operations/launch" \
            --header 'accept: application/json' \
            --header 'content-type: application/json' \
            --user "${LAMBDA_LABS_KEY}:" \
            --data "${BODY}")"

          INSTANCE_ID="$(echo "${RESP}" | jq -r '.data.instance_ids[0] // empty')"
          if [ -z "${INSTANCE_ID}" ]; then
            echo "Failed to launch instance. Response: ${RESP}"
            exit 1
          fi

          echo "instance_id=${INSTANCE_ID}" >> "$GITHUB_OUTPUT"

      # Polls the instance for up to 20 minutes until it reports status
      # "active" with an IP, then exposes that IP as the `instance_ip` output.
      - name: Wait for Lambda instance to become active + get IP
        id: lambda-wait
        env:
          LAMBDA_API_BASE: ${{ env.LAMBDA_API_BASE }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
        run: |
          set -euo pipefail
          python - <<'PY'
          import os
          import time
          import requests

          base = os.environ["LAMBDA_API_BASE"].rstrip("/")
          instance_id = os.environ["INSTANCE_ID"]
          api_key = os.environ["LAMBDA_LABS_KEY"]

          url = f"{base}/instances/{instance_id}"
          deadline = time.time() + 20 * 60
          ip = None
          last = None
          while time.time() < deadline:
              try:
                  # Bounded timeout so a stalled connection cannot hang the job.
                  r = requests.get(
                      url,
                      headers={"accept": "application/json"},
                      auth=(api_key, ""),
                      timeout=30,
                  )
              except requests.RequestException as e:
                  # Transient network failure: remember it and retry.
                  last = ("error", repr(e))
                  time.sleep(3.0)
                  continue
              if r.status_code >= 400:
                  last = (r.status_code, r.text[:500])
                  time.sleep(2.0)
                  continue
              try:
                  data = (r.json() or {}).get("data") or {}
              except ValueError as e:
                  # Non-JSON body on a 2xx/3xx response: treat as transient.
                  last = ("bad-json", repr(e))
                  time.sleep(3.0)
                  continue
              status = data.get("status")
              ip = data.get("ip")
              last = {"status": status, "ip": ip}
              if status == "active" and ip:
                  print(ip)
                  break
              time.sleep(3.0)  # API is rate-limited; keep this gentle.
          else:
              raise SystemExit(f"Timed out waiting for instance to become active. last={last!r}")

          out = os.environ["GITHUB_OUTPUT"]
          with open(out, "a", encoding="utf-8") as f:
              f.write(f"instance_ip={ip}\n")
          PY

      - name: SSH bootstrap + run container
        id: lambda-remote
        env:
          INSTANCE_IP: ${{ steps.lambda-wait.outputs.instance_ip }}
          KEY_PATH: ${{ steps.lambda-ssh.outputs.ssh_private_key_path }}
          FULL_IMAGE: ${{ steps.img.outputs.full_image }}
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_PASSWORD: ${{ steps.ecrpw.outputs.ecr_password }}
          SERVER_PORT: ${{ env.SERVER_PORT }}
        run: |
          set -euo pipefail

          # Wait for SSH to accept connections (60 attempts, 5s apart).
          ssh_ready=0
          for _ in {1..60}; do
            if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
              -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" "echo ok" >/dev/null 2>&1; then
              ssh_ready=1
              break
            fi
            sleep 5
          done
          # Fail with a clear message instead of falling through to the
          # bootstrap command when the host never became reachable.
          if [ "${ssh_ready}" -ne 1 ]; then
            echo "Timed out waiting for SSH on ${INSTANCE_IP}"
            exit 1
          fi

          # Run remote bootstrap + start API
          #
          # NOTE: We pass ECR credentials and image as inline env vars for the remote shell
          # (Lambda instance won't have these set).
          # The quoted heredoc keeps the script literal on the runner; printf %q
          # then shell-escapes it into a single argument for the remote bash -lc.
          ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
            -i "${KEY_PATH}" ubuntu@"${INSTANCE_IP}" \
            "ECR_PASSWORD='${ECR_PASSWORD}' ECR_REGISTRY='${ECR_REGISTRY}' FULL_IMAGE='${FULL_IMAGE}' SERVER_PORT='${SERVER_PORT}' bash -lc $(printf %q "$(cat <<'BASH'
          set -euo pipefail

          echo "Checking docker..."
          if ! command -v docker >/dev/null 2>&1; then
            echo "docker not found; installing"
            sudo apt-get update -y
            sudo apt-get install -y docker.io
          fi

          sudo systemctl enable --now docker || true

          # ECR login (runner provides short-lived password)
          echo "${ECR_PASSWORD}" | sudo docker login --username AWS --password-stdin "${ECR_REGISTRY}"

          # Pull and run image (explicit uvicorn command for consistency with RunPod template)
          sudo docker pull "${FULL_IMAGE}"
          sudo docker rm -f ylff || true

          # Provide a stable cache volume similar to RunPod's /workspace.
          sudo mkdir -p /workspace/.cache

          sudo docker run -d --restart=unless-stopped \
            --gpus all \
            --name ylff \
            -p ${SERVER_PORT}:8000 \
            -v /workspace:/workspace \
            -e PYTHONUNBUFFERED=1 \
            -e PYTHONPATH=/app \
            -e XDG_CACHE_HOME=/workspace/.cache \
            -e HF_HOME=/workspace/.cache/huggingface \
            -e HUGGINGFACE_HUB_CACHE=/workspace/.cache/huggingface/hub \
            -e TRANSFORMERS_CACHE=/workspace/.cache/huggingface/transformers \
            -e TORCH_HOME=/workspace/.cache/torch \
            "${FULL_IMAGE}" \
            python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000 --log-level info --access-log

          echo "Container started. Recent logs:"
          sudo docker logs --tail 50 ylff || true
          BASH
          )")"

      # Polls GET /health every 5s until it returns 200 or the timeout elapses.
      - name: Wait for API health
        env:
          BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          HEALTH_TIMEOUT_S: ${{ github.event.inputs.health_timeout_s }}
        run: |
          set -e
          python - <<'PY'
          import os
          import time
          import requests
          from urllib.parse import urljoin

          base = os.environ["BASE_URL"].rstrip("/") + "/"
          timeout_s = int((os.environ.get("HEALTH_TIMEOUT_S") or "2400").strip())
          url = urljoin(base, "health")

          start = time.time()
          last = None
          print(f"Polling {url} (timeout={timeout_s}s) ...", flush=True)
          while True:
              elapsed = int(time.time() - start)
              try:
                  r = requests.get(url, timeout=10)
                  last = (r.status_code, (r.text or "")[:300])
                  if r.status_code == 200:
                      print("API is healthy.", flush=True)
                      # SystemExit is not an Exception subclass, so the
                      # handler below does not swallow it.
                      raise SystemExit(0)
              except Exception as e:
                  last = ("error", repr(e))
              if elapsed >= timeout_s:
                  break
              time.sleep(5)
          raise SystemExit(f"Timed out waiting for /health. last={last!r}")
          PY

      - name: Run remote smoke pytest
        env:
          RUNPOD_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          YLFF_SMOKE_DEVICE: "cuda"
          YLFF_SMOKE_MODEL: ${{ env.SMOKE_MODEL }}
          YLFF_SMOKE_TIMEOUT_S: ${{ github.event.inputs.timeout_s }}
          # Lambda GPU names vary by region/capacity; don't assert a strict substring by default.
          YLFF_EXPECT_GPU_SUBSTR: ""
          YLFF_RUN_INFERENCE_PIPELINE_SMOKE: "1"
          YLFF_SMOKE_PIPELINE_SAMPLE: "arkitscenes_40753679_clip"
        run: |
          pytest -q \
            tests/test_remote_runpod_smoke.py \
            tests/test_remote_runpod_train_smoke.py

      # Written even when earlier steps failed so the run page always shows
      # which instance/image was involved.
      - name: Lambda smoke summary
        if: always()
        env:
          BASE_URL: http://${{ steps.lambda-wait.outputs.instance_ip }}:${{ env.SERVER_PORT }}/
          FULL_IMAGE: ${{ steps.img.outputs.full_image }}
          REGION: ${{ github.event.inputs.region }}
          INSTANCE_TYPE: ${{ github.event.inputs.instance_type }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
        run: |
          {
            echo "## Lambda GPU Smoke Summary"
            echo ""
            echo "- **Instance ID**: \`${INSTANCE_ID}\`"
            echo "- **Region**: \`${REGION}\`"
            echo "- **Instance type**: \`${INSTANCE_TYPE}\`"
            echo "- **Base URL**: ${BASE_URL}"
            echo "- **Docker image**: \`${FULL_IMAGE}\`"
            echo ""
            echo "- **Lambda Cloud API docs**: https://docs-api.lambda.ai/api/cloud"
            echo ""
          } >> "$GITHUB_STEP_SUMMARY"

      # Best-effort teardown: each resource is removed only if its creating
      # step produced an ID, and a failure of one call never blocks the others.
      - name: Cleanup (terminate instance + delete firewall ruleset + delete SSH key)
        if: always()
        env:
          LAMBDA_LABS_KEY: ${{ secrets.LAMBDA_LABS_KEY }}
          INSTANCE_ID: ${{ steps.lambda-launch.outputs.instance_id }}
          FW_ID: ${{ steps.lambda-fw.outputs.fw_id }}
          SSH_KEY_ID: ${{ steps.lambda-ssh.outputs.ssh_key_id }}
        run: |
          # Deliberately disable strict mode: cleanup must keep going on errors.
          set +euo pipefail

          if [ -n "${INSTANCE_ID}" ]; then
            curl -sS --fail \
              --request POST \
              --url "${{ env.LAMBDA_API_BASE }}/instance-operations/terminate" \
              --header 'accept: application/json' \
              --header 'content-type: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              --data "$(jq -nc --arg id "${INSTANCE_ID}" '{instance_ids: [$id]}')" \
              || true
          fi

          if [ -n "${FW_ID}" ]; then
            curl -sS --fail \
              --request DELETE \
              --url "${{ env.LAMBDA_API_BASE }}/firewall-rulesets/${FW_ID}" \
              --header 'accept: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              || true
          fi

          if [ -n "${SSH_KEY_ID}" ]; then
            curl -sS --fail \
              --request DELETE \
              --url "${{ env.LAMBDA_API_BASE }}/ssh-keys/${SSH_KEY_ID}" \
              --header 'accept: application/json' \
              --user "${LAMBDA_LABS_KEY}:" \
              || true
          fi