# 3d_model/.github/workflows/deploy-runpod.yml
# Azan
# Clean deployment build (Squashed)
# 7a87926
# Deploys the YLFF container image from ECR to a RunPod pod.
name: Deploy to RunPod

on:
  # Automatic trigger: fires when the smoke-test workflow completes for main or dev.
  workflow_run:
    workflows:
      - "RunPod H100x1 Smoke Test"
    types:
      - completed
    branches:
      - main
      - dev
  # Manual trigger with optional overrides.
  workflow_dispatch:
    inputs:
      image_tag:
        description: "Docker image tag to deploy"
        required: false
        default: "auto"
      gpu_type:
        description: "RunPod GPU type (e.g. NVIDIA RTX A6000, NVIDIA H100 PCIe)"
        required: false
        default: "NVIDIA RTX A6000"
      gpu_count:
        description: "GPU count"
        required: false
        default: "1"

env:
  AWS_REGION: us-east-1
  ECR_REPOSITORY: ylff
  RUNPOD_TEMPLATE_NAME: "YLFF-Dev-Template"

# Every run of this workflow stops, removes and recreates the same pod and
# template, so all runs share one group regardless of ref. In-progress runs are
# NOT cancelled: a cancellation between "remove old pod" and "create new pod"
# would leave the environment without any pod.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

permissions:
  contents: read
  # Needed so the job can request an OIDC token for the AWS role assumption.
  id-token: write
jobs:
  deploy:
    # Manual dispatches always run; workflow_run triggers only run when the
    # triggering workflow finished with conclusion "success".
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success')
    runs-on: ubuntu-latest
    steps:
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    # On workflow_run events the default checkout is the default branch, not the
    # commit that was tested. Pin to the triggering run's commit; manual
    # dispatches have no workflow_run payload and fall back to github.sha.
    ref: ${{ github.event.workflow_run.head_sha || github.sha }}
# Provides a Python 3.11 interpreter on the runner.
- name: Set up Python
  uses: actions/setup-python@v5
  with:
    # Quoted so the version stays a string.
    python-version: '3.11'
# Restores/saves the pip download cache, keyed on the requirements files.
- name: Cache pip packages
  uses: actions/cache@v4
  with:
    key: ${{ runner.os }}-pip-runpod-${{ hashFiles('**/requirements*.txt') }}
    # Prefix fallback used when no exact key match exists.
    restore-keys: |
      ${{ runner.os }}-pip-runpod-
    path: ~/.cache/pip
- name: Install RunPod CLI
  run: |
    # Installs the runpodctl binary from its GitHub releases into /usr/local/bin.
    set -e
    echo "Installing runpodctl from GitHub releases..."

    # Resolve the newest release tag; an empty or "null" answer (failed request,
    # unexpected JSON) falls back to a pinned version.
    cli_version=$(curl -s https://api.github.com/repos/Run-Pod/runpodctl/releases/latest | jq -r '.tag_name')
    case "$cli_version" in
      "" | null)
        echo "Failed to get latest version, using fallback version v1.14.3"
        cli_version="v1.14.3"
        ;;
    esac
    echo "Installing runpodctl version: $cli_version"

    # Fetch the linux-amd64 binary into the working directory.
    wget --quiet --show-progress \
      -O runpodctl \
      "https://github.com/Run-Pod/runpodctl/releases/download/${cli_version}/runpodctl-linux-amd64"

    # Mark it executable and put it on the PATH.
    chmod +x runpodctl
    sudo mv runpodctl /usr/local/bin/runpodctl

    # Smoke-check the installed binary.
    echo "Verifying runpodctl installation..."
    runpodctl version
    echo "runpodctl installed successfully"
- name: Configure RunPod
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
  run: |
    # Stores the RunPod API key in runpodctl's configuration.
    # The key is always read from the RUNPOD_API_KEY environment variable and is
    # never expanded into the script text, so shell metacharacters in the secret
    # cannot break or alter the script.
    if [ -z "${RUNPOD_API_KEY:-}" ]; then
      echo "RUNPOD_API_KEY secret is empty or not set" >&2
      exit 1
    fi

    echo "Configuring runpodctl with API key..."
    # Try using the config command first
    if runpodctl config --apiKey "$RUNPOD_API_KEY"; then
      echo "runpodctl configured successfully using config command"
    else
      echo "Config command failed, using manual YAML configuration..."
      # Fallback: write the config file by hand. The subshell umask makes the
      # file owner-only from the moment it is created.
      mkdir -p ~/.runpod
      (
        umask 077
        printf 'apiKey: %s\n' "$RUNPOD_API_KEY" > ~/.runpod/.runpod.yaml
      )
      chmod 600 ~/.runpod/.runpod.yaml
      echo "Manual YAML configuration completed"
    fi

    # Best-effort check that the CLI runs; a failure here only warns.
    echo "Testing runpodctl configuration..."
    if runpodctl get pod --help > /dev/null 2>&1; then
      echo "runpodctl configuration verified successfully"
    else
      echo "Warning: runpodctl configuration verification failed, but continuing..."
    fi
# Assumes the GitHub Actions IAM role for the configured region.
- name: Configure AWS credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ env.AWS_REGION }}
    role-to-assume: 'arn:aws:iam::211125621822:role/github-actions-role'
    role-session-name: GitHubActionsSession
    output-credentials: true
# Logs in to ECR; later steps read the registry host from this step's
# `registry` output.
- id: login-ecr
  name: Login to Amazon ECR
  uses: aws-actions/amazon-ecr-login@v2
- name: Determine image tag
  id: image-tag
  env:
    # Event data and user input are passed as environment variables instead of
    # being expanded into the script, so their contents are never parsed as shell.
    EVENT_NAME: ${{ github.event_name }}
    REQUESTED_TAG: ${{ github.event.inputs.image_tag }}
    RUN_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
    RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
    REGISTRY: ${{ steps.login-ecr.outputs.registry }}
  run: |
    # Resolves which image tag to deploy.
    # - workflow_run: always "auto", using the triggering run's branch and commit.
    # - workflow_dispatch: the requested tag (empty means "auto").
    # For "auto"/"latest" the immutable "<branch>-<short sha>" tag is preferred
    # when it exists in ECR; otherwise "auto" degrades to "latest".
    # Outputs: image_tag, full_image.
    set -euo pipefail

    if [ "$EVENT_NAME" = "workflow_run" ]; then
      IMAGE_TAG="auto"
      BRANCH="$RUN_HEAD_BRANCH"
      SHORT_SHA="${RUN_HEAD_SHA::7}"
    else
      IMAGE_TAG="${REQUESTED_TAG:-auto}"
      BRANCH="${GITHUB_REF_NAME}"
      SHORT_SHA="${GITHUB_SHA::7}"
    fi

    CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}"
    if [ "${IMAGE_TAG}" = "latest" ] || [ "${IMAGE_TAG}" = "auto" ]; then
      if aws ecr describe-images \
        --repository-name "${ECR_REPOSITORY}" \
        --image-ids "imageTag=${CANDIDATE_TAG}" \
        --region "${AWS_REGION}" >/dev/null 2>&1; then
        echo "Using immutable ECR tag: ${CANDIDATE_TAG}"
        IMAGE_TAG="${CANDIDATE_TAG}"
      else
        if [ "${IMAGE_TAG}" = "auto" ]; then
          IMAGE_TAG="latest"
        fi
        echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${IMAGE_TAG}"
      fi
    fi

    # Use ECR image path
    FULL_IMAGE="${REGISTRY}/${ECR_REPOSITORY}:${IMAGE_TAG}"
    {
      echo "image_tag=${IMAGE_TAG}"
      echo "full_image=${FULL_IMAGE}"
    } >> "$GITHUB_OUTPUT"
    echo "Branch: ${BRANCH:-unknown}"
    echo "Using image: ${FULL_IMAGE}"
- name: Verify image exists in ECR
  run: |
    # Informational check only: reports whether the resolved tag exists in ECR
    # and lists the known tags when it does not. The step never fails.
    image_ref="${{ steps.image-tag.outputs.full_image }}"
    tag="${{ steps.image-tag.outputs.image_tag }}"

    echo "πŸ” Verifying image exists in ECR..."
    echo "Checking for: ${image_ref}"

    # Happy path: the tag resolves, nothing more to do.
    if aws ecr describe-images \
      --region ${{ env.AWS_REGION }} \
      --repository-name ${{ env.ECR_REPOSITORY }} \
      --image-ids imageTag=${tag} 2>/dev/null; then
      echo "βœ… Image found in ECR with tag: ${tag}"
      exit 0
    fi

    echo "❌ Image not found with tag: ${tag}"
    echo "πŸ” Checking available tags..."

    # Collect every tag in the repository (empty string when the call fails).
    known_tags=$(aws ecr describe-images \
      --region ${{ env.AWS_REGION }} \
      --repository-name ${{ env.ECR_REPOSITORY }} \
      --query 'imageDetails[*].imageTags[*]' \
      --output text 2>/dev/null || echo "")

    if [ -z "$known_tags" ]; then
      echo "No tags found in ECR repository"
    else
      echo "Available tags in ECR:"
      echo "$known_tags"
    fi

    echo "⚠️ Continuing anyway - image may be available or will be created"
- name: Get ECR credentials for RunPod
  id: ecr-credentials
  env:
    REGISTRY: ${{ steps.login-ecr.outputs.registry }}
  run: |
    # Fetches an ECR login password and exposes it (plus the registry host) as
    # step outputs for the template step.
    # Outputs: ecr_credentials, ecr_registry.
    set -euo pipefail

    echo "πŸ” Getting ECR credentials for RunPod authentication..."
    ECR_CREDENTIALS="$(aws ecr get-login-password --region "${AWS_REGION}")"
    if [ -z "${ECR_CREDENTIALS}" ]; then
      echo "Failed to retrieve an ECR login password" >&2
      exit 1
    fi

    # The token is not a registered repository secret, so the runner would print
    # it verbatim wherever it appears. Register it as a masked value BEFORE it is
    # written to the step outputs.
    echo "::add-mask::${ECR_CREDENTIALS}"

    {
      echo "ecr_credentials=${ECR_CREDENTIALS}"
      echo "ecr_registry=${REGISTRY}"
    } >> "$GITHUB_OUTPUT"
    echo "βœ… ECR credentials retrieved"
- name: Stop and Remove Existing Pod
  id: stop-pod
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
    STABLE_POD_NAME: "ylff-dev-stable"
  run: |
    # Stops and removes every pod whose listing line mentions STABLE_POD_NAME,
    # then polls until none is listed any more.
    # Output: pod_id (comma-separated IDs that were found; empty when none).

    # Prints the first column (pod ID) of each listed pod line containing the
    # stable name. The name is matched as a fixed string, and a failed listing
    # or "no match" yields empty output instead of an error.
    find_pod_ids() {
      { runpodctl get pod --allfields 2>/dev/null || true; } \
        | { grep -F -- "$STABLE_POD_NAME" || true; } \
        | awk '{print $1}'
    }

    echo "πŸ” Checking for existing pod: $STABLE_POD_NAME"
    mapfile -t EXISTING_POD_IDS < <(find_pod_ids)

    if [ "${#EXISTING_POD_IDS[@]}" -eq 0 ]; then
      echo "No existing pod found"
      echo "pod_id=" >> "$GITHUB_OUTPUT"
      exit 0
    fi

    echo "Found existing pod: ${EXISTING_POD_IDS[*]}"
    # Join with commas so the output stays on a single line even when several
    # pods match (a multi-line value would corrupt the GITHUB_OUTPUT file).
    (IFS=,; echo "pod_id=${EXISTING_POD_IDS[*]}") >> "$GITHUB_OUTPUT"

    # Stop the pod(s) first
    for pod_id in "${EXISTING_POD_IDS[@]}"; do
      echo "Stopping pod..."
      runpodctl stop pod "$pod_id" || true
    done
    sleep 20

    # Remove the pod(s)
    for pod_id in "${EXISTING_POD_IDS[@]}"; do
      echo "Removing pod..."
      runpodctl remove pod "$pod_id" || true
    done
    sleep 20

    # Verify pod is fully removed before proceeding
    echo "Verifying pod removal..."
    removed=false
    for verify_attempt in {1..10}; do
      if [ -z "$(find_pod_ids)" ]; then
        echo "βœ… Pod fully removed"
        removed=true
        break
      fi
      echo "Pod still exists (attempt $verify_attempt/10), waiting..."
      sleep 10
    done

    # Removal stays best-effort, but the outcome is no longer silent.
    if [ "$removed" != "true" ]; then
      echo "WARNING: pod is still listed after all verification attempts; continuing anyway"
    fi
    echo "βœ… Proceeding with template and auth cleanup"
- name: Create or Update RunPod Template
  id: create-template
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
    FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
    ECR_CREDENTIALS: ${{ steps.ecr-credentials.outputs.ecr_credentials }}
    ECR_REGISTRY: ${{ steps.ecr-credentials.outputs.ecr_registry }}
  run: |
    # Recreates the RunPod template that points at FULL_IMAGE:
    #   1. delete any template that already uses the configured name,
    #   2. replace the stored ECR registry credentials,
    #   3. save a new template bound to those credentials.
    # Whenever a name cannot be freed, a "-<unix timestamp>" suffix is used.
    # Outputs: template_id, template_name.
    TEMPLATE_NAME="${{ env.RUNPOD_TEMPLATE_NAME }}"
    TIMESTAMP=$(date +%s)
    GRAPHQL_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}"
    REGISTRY_AUTH_URL="https://rest.runpod.io/v1/containerregistryauth"
    AUTH_NAME="ecr-auth-ylff"
    AUTH_ID=""

    # ---- helpers ----------------------------------------------------------

    # find_template_id <field> <value>
    # Prints the id of every template whose <field> ("name" or "id") equals
    # <value>. Prints nothing when nothing matches or the response is unusable.
    find_template_id() {
      local field="$1" value="$2"
      curl -s --request POST \
        --header 'content-type: application/json' \
        --url "$GRAPHQL_URL" \
        --data '{"query":"query { myself { podTemplates { id name } } }"}' \
        | jq -r --arg value "$value" ".data.myself.podTemplates[] | select(.${field} == \$value) | .id" 2>/dev/null \
        || echo ""
    }

    # Prints the raw JSON list of container registry auth entries.
    list_registry_auths() {
      curl -s --request GET \
        --header 'Content-Type: application/json' \
        --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
        --url "$REGISTRY_AUTH_URL"
    }

    # verify_auth_exists <auth id>
    # Succeeds only when the given id is present in the registry auth list.
    verify_auth_exists() {
      local auth_id_to_check="$1" found_id
      if [ -z "$auth_id_to_check" ] || [ "$auth_id_to_check" = "null" ]; then
        return 1
      fi
      found_id=$(list_registry_auths | jq -r --arg id "$auth_id_to_check" '.[] | select(.id == $id) | .id' 2>/dev/null || echo "")
      [ -n "$found_id" ] && [ "$found_id" != "null" ]
    }

    # create_registry_auth <name>
    # Creates a registry auth entry (username "AWS", password = ECR token) and
    # prints the API response. The body is built with jq so the values are
    # always valid JSON strings.
    create_registry_auth() {
      local payload
      payload=$(jq -n --arg name "$1" --arg password "$ECR_CREDENTIALS" \
        '{name: $name, username: "AWS", password: $password}')
      curl -s --request POST \
        --header 'Content-Type: application/json' \
        --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
        --url "$REGISTRY_AUTH_URL" \
        --data "$payload"
    }

    # save_template <name>
    # Sends the saveTemplate mutation for <name> using FULL_IMAGE and AUTH_ID and
    # prints the API response. This is the ONLY place the template definition
    # lives, so the first attempt and every retry create identical templates
    # (including the /workspace cache environment variables).
    save_template() {
      local name="$1"
      curl -s --request POST \
        --header 'content-type: application/json' \
        --url "$GRAPHQL_URL" \
        --data "{\"query\":\"mutation { saveTemplate(input: { containerDiskInGb: 10, dockerArgs: \\\"python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000\\\", env: [ { key: \\\"PYTHONUNBUFFERED\\\", value: \\\"1\\\" }, { key: \\\"PYTHONPATH\\\", value: \\\"/app\\\" }, { key: \\\"XDG_CACHE_HOME\\\", value: \\\"/workspace/.cache\\\" }, { key: \\\"HF_HOME\\\", value: \\\"/workspace/.cache/huggingface\\\" }, { key: \\\"HUGGINGFACE_HUB_CACHE\\\", value: \\\"/workspace/.cache/huggingface/hub\\\" }, { key: \\\"TRANSFORMERS_CACHE\\\", value: \\\"/workspace/.cache/huggingface/transformers\\\" }, { key: \\\"TORCH_HOME\\\", value: \\\"/workspace/.cache/torch\\\" } ], imageName: \\\"${FULL_IMAGE}\\\", name: \\\"${name}\\\", ports: \\\"22/tcp,8000/http\\\", readme: \\\"## YLFF Template\\\\nTemplate for running YLFF API server on port 8000\\\", volumeInGb: 20, volumeMountPath: \\\"/workspace\\\", containerRegistryAuthId: \\\"${AUTH_ID}\\\" }) { id } }\"}"
    }

    # ---- 1. remove the previous template -----------------------------------

    EXISTING_TEMPLATE_ID=$(find_template_id name "$TEMPLATE_NAME")
    if [ -n "$EXISTING_TEMPLATE_ID" ] && [ "$EXISTING_TEMPLATE_ID" != "null" ]; then
      echo "Found existing template: $EXISTING_TEMPLATE_ID"
      echo "Deleting old template..."
      # Try to delete the template (multiple attempts with delays)
      # NOTE(review): confirm that deleteTemplate accepts a templateId argument.
      for attempt in {1..3}; do
        curl -s --request POST \
          --header 'content-type: application/json' \
          --url "$GRAPHQL_URL" \
          --data "{\"query\":\"mutation { deleteTemplate(templateId: \\\"$EXISTING_TEMPLATE_ID\\\") }\"}" \
          > /dev/null || true
        sleep 5
        # Verify template was deleted
        if [ -z "$(find_template_id id "$EXISTING_TEMPLATE_ID")" ]; then
          echo "βœ… Template deleted successfully"
          break
        fi
        echo "⚠️ Template still exists (attempt $attempt/3), waiting longer..."
        sleep 10
      done
      # If the name is still taken after all attempts, use timestamp suffix
      if [ -n "$(find_template_id name "$TEMPLATE_NAME")" ]; then
        echo "⚠️ Template with name '$TEMPLATE_NAME' still exists, using timestamp suffix"
        TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}"
        echo "New template name: $TEMPLATE_NAME"
      fi
    fi

    # ---- 2. replace the ECR registry auth -----------------------------------

    EXISTING_AUTH_ID=$(list_registry_auths | jq -r --arg name "$AUTH_NAME" '.[] | select(.name == $name) | .id' 2>/dev/null || echo "")
    if [ -n "$EXISTING_AUTH_ID" ] && [ "$EXISTING_AUTH_ID" != "null" ]; then
      echo "Found existing ECR auth: $EXISTING_AUTH_ID"
      # Verify it actually exists before trying to delete
      if verify_auth_exists "$EXISTING_AUTH_ID"; then
        echo "Verifying auth exists before deletion..."
        # Try to delete it, but handle errors gracefully
        DELETE_AUTH_HTTP_CODE=$(curl -s -o /tmp/auth_delete_response.txt -w "%{http_code}" --request DELETE \
          --header 'Content-Type: application/json' \
          --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
          --url "${REGISTRY_AUTH_URL}/$EXISTING_AUTH_ID")
        DELETE_AUTH_RESPONSE=$(cat /tmp/auth_delete_response.txt 2>/dev/null || echo "")
        # Check if deletion succeeded (204/200 are success codes)
        if [ "$DELETE_AUTH_HTTP_CODE" = "204" ] || [ "$DELETE_AUTH_HTTP_CODE" = "200" ]; then
          echo "βœ… ECR auth deleted successfully (HTTP $DELETE_AUTH_HTTP_CODE)"
          # Wait and verify deletion (for informational/logging purposes)
          sleep 3
          for verify_attempt in {1..5}; do
            if ! verify_auth_exists "$EXISTING_AUTH_ID"; then
              echo "βœ… Auth deletion verified (attempt $verify_attempt)"
              break
            fi
            echo "⚠️ Auth still exists (attempt $verify_attempt/5), waiting..."
            sleep 2
          done
        elif echo "$DELETE_AUTH_RESPONSE" | grep -qi "in use\|error\|failed"; then
          echo "⚠️ ECR auth deletion failed (HTTP $DELETE_AUTH_HTTP_CODE)"
          echo "Response: $DELETE_AUTH_RESPONSE"
          echo "Auth may be in use. Will create new auth with timestamp suffix"
          AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
        else
          echo "⚠️ ECR auth deletion returned unexpected status (HTTP $DELETE_AUTH_HTTP_CODE)"
          echo "Response: $DELETE_AUTH_RESPONSE"
          echo "Will create new auth with timestamp suffix"
          AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
        fi
      else
        echo "⚠️ Existing auth ID found but doesn't exist in RunPod, will create new one"
      fi
    fi

    # Create new ECR auth (always create fresh to avoid stale references)
    echo "Creating new ECR auth: $AUTH_NAME"
    AUTH_RESPONSE=$(create_registry_auth "$AUTH_NAME")
    AUTH_ID=$(echo "$AUTH_RESPONSE" | jq -r '.id' 2>/dev/null || echo "")
    if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then
      ERROR_MSG=$(echo "$AUTH_RESPONSE" | jq -r '.message // .error // "Unknown error"' 2>/dev/null || echo "")
      echo "❌ Failed to create ECR auth"
      echo "Response: $AUTH_RESPONSE"
      echo "Error: $ERROR_MSG"
      # Try with timestamp suffix as fallback
      AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
      echo "Retrying with name: $AUTH_NAME"
      AUTH_RESPONSE=$(create_registry_auth "$AUTH_NAME")
      AUTH_ID=$(echo "$AUTH_RESPONSE" | jq -r '.id' 2>/dev/null || echo "")
      if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then
        echo "❌ Failed to create ECR auth even with timestamp suffix"
        echo "Response: $AUTH_RESPONSE"
        exit 1
      fi
    fi

    # Verify the auth was created and exists (one retry after a longer wait)
    echo "Verifying created ECR auth: $AUTH_ID"
    sleep 2
    if verify_auth_exists "$AUTH_ID"; then
      echo "βœ… ECR authentication verified: $AUTH_ID"
    else
      echo "⚠️ ECR auth created but verification failed, waiting longer..."
      sleep 5
      if verify_auth_exists "$AUTH_ID"; then
        echo "βœ… ECR authentication verified after wait: $AUTH_ID"
      else
        echo "❌ ECR auth verification failed after retry"
        echo "This may cause template creation to fail"
      fi
    fi

    # ---- 3. create the template ---------------------------------------------

    # Final check: ensure template name is available before creating
    NAME_EXISTS=$(find_template_id name "$TEMPLATE_NAME")
    if [ -n "$NAME_EXISTS" ] && [ "$NAME_EXISTS" != "null" ]; then
      echo "⚠️ Template name '$TEMPLATE_NAME' still exists, using timestamp suffix"
      TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}"
      echo "Using new template name: $TEMPLATE_NAME"
    fi

    # Validate AUTH_ID before creating template
    if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then
      echo "❌ Cannot create template: ECR auth ID is missing"
      exit 1
    fi
    # Verify auth still exists before using it
    if ! verify_auth_exists "$AUTH_ID"; then
      echo "❌ Cannot create template: ECR auth ID $AUTH_ID does not exist"
      echo "This may indicate a timing issue. Please retry the deployment."
      exit 1
    fi

    echo "Creating template: $TEMPLATE_NAME"
    echo "Using ECR auth ID: $AUTH_ID"
    echo "Using image: ${FULL_IMAGE}"
    CREATE_RESPONSE=$(save_template "$TEMPLATE_NAME")
    TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "")

    if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then
      ERROR_MSG=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].message' 2>/dev/null || echo "")
      ERROR_PATH=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].path[0]' 2>/dev/null || echo "")
      echo "❌ Failed to create template"
      echo "Response: $CREATE_RESPONSE"
      if [ -z "$ERROR_MSG" ]; then
        exit 1
      fi
      echo "Error message: $ERROR_MSG"
      echo "Error path: $ERROR_PATH"

      # Handle specific error cases
      if echo "$ERROR_MSG" | grep -qi "Registry Auth not found\|containerRegistryAuthId"; then
        echo "❌ ECR auth ID $AUTH_ID not found in RunPod"
        echo "Attempting to verify auth existence..."
        if ! verify_auth_exists "$AUTH_ID"; then
          echo "❌ Auth does not exist. Cannot create template."
          exit 1
        fi
        echo "⚠️ Auth exists but template creation failed. This may be a RunPod API issue."
        echo "Retrying template creation after delay..."
        sleep 5
        # Retry once with the same name and the same template definition
        CREATE_RESPONSE=$(save_template "$TEMPLATE_NAME")
        TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "")
        if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then
          echo "❌ Retry also failed"
          exit 1
        fi
      elif echo "$ERROR_MSG" | grep -qi "unique\|already exists"; then
        echo "⚠️ Template name already exists, trying with timestamp suffix"
        TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}"
        # Try again with timestamp
        CREATE_RESPONSE=$(save_template "$TEMPLATE_NAME")
        TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "")
        if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then
          echo "❌ Failed to create template even with timestamp suffix"
          echo "Response: $CREATE_RESPONSE"
          exit 1
        fi
      else
        exit 1
      fi
    fi

    {
      echo "template_id=$TEMPLATE_ID"
      echo "template_name=$TEMPLATE_NAME"
    } >> "$GITHUB_OUTPUT"
    echo "βœ… Template created/updated: $TEMPLATE_ID (name: $TEMPLATE_NAME)"
- name: Deploy to RunPod - Create or Update Pod
  id: deploy-pod
  env:
    RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
    FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
    STABLE_POD_NAME: "ylff-dev-stable"
    # Step outputs and dispatch inputs are passed through the environment so
    # their contents are never parsed as shell code.
    TEMPLATE_ID: ${{ steps.create-template.outputs.template_id }}
    GPU_TYPE: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_type || 'NVIDIA RTX A6000' }}
    GPU_COUNT: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_count || '1' }}
  run: |
    # Removes any pod still carrying the stable name, creates the new pod from
    # the freshly saved template (one retry), then records its details.
    # Outputs (only when the pod is listed afterwards): pod_id, pod_url, pod_status.
    set -euo pipefail

    # Prints every listed pod line that mentions the stable name (fixed-string
    # match). A failed listing or "no match" yields empty output, not an error.
    list_pod_lines() {
      { runpodctl get pod --allfields 2>/dev/null || true; } \
        | { grep -F -- "$STABLE_POD_NAME" || true; }
    }

    # Issues the pod creation request; returns runpodctl's exit status.
    create_pod() {
      runpodctl create pod \
        --name="$STABLE_POD_NAME" \
        --imageName="$FULL_IMAGE" \
        --templateId="$TEMPLATE_ID" \
        --gpuType="$GPU_TYPE" \
        --gpuCount="$GPU_COUNT" \
        --secureCloud \
        --containerDiskSize=20 \
        --mem=32 \
        --vcpu=4
    }

    # Check if pod already exists (there may be more than one match)
    EXISTING_POD_IDS=$(list_pod_lines | awk '{print $1}')
    if [ -n "$EXISTING_POD_IDS" ]; then
      for pod_id in $EXISTING_POD_IDS; do
        echo "Found existing pod: $pod_id"
        echo "Stopping existing pod for update..."
        runpodctl stop pod "$pod_id" || true
      done
      sleep 10
      for pod_id in $EXISTING_POD_IDS; do
        echo "Removing old pod to deploy new version..."
        runpodctl remove pod "$pod_id" || true
      done
      sleep 15
    else
      echo "No existing pod found, will create new one"
    fi
    sleep 10

    # Create the pod
    echo "Creating pod: $STABLE_POD_NAME"
    echo "Using image: $FULL_IMAGE"
    echo "Using template: $TEMPLATE_ID"
    # The call sits inside the `if` condition on purpose: with `set -e` a bare
    # failing command would abort the script before any retry could run.
    if ! create_pod; then
      echo "Failed to create pod, retrying once..."
      sleep 10
      if ! create_pod; then
        echo "Failed to create pod after retry" >&2
        exit 1
      fi
    fi

    # Wait for pod to initialize
    echo "Waiting for pod to initialize..."
    sleep 30

    # Get pod details. Only the first matching line is used so every output
    # value stays on a single line.
    POD_LINE=$(list_pod_lines | awk 'NR == 1')
    if [ -n "$POD_LINE" ]; then
      POD_ID=$(awk '{print $1}' <<< "$POD_LINE")
      POD_STATUS=$(awk '{print $7}' <<< "$POD_LINE")
      POD_URL="https://${POD_ID}-8000.proxy.runpod.net"
      echo "βœ… Pod created successfully!"
      echo " Pod Name: $STABLE_POD_NAME"
      echo " Pod ID: $POD_ID"
      echo " Status: $POD_STATUS"
      echo " Backend URL: $POD_URL"
      # Save pod details for summary
      {
        echo "pod_id=${POD_ID}"
        echo "pod_url=${POD_URL}"
        echo "pod_status=${POD_STATUS}"
      } >> "$GITHUB_OUTPUT"
    else
      echo "⚠️ Pod created but details not available yet"
    fi
- name: Wait for deployed API health
  if: always()
  env:
    POD_URL: ${{ steps.deploy-pod.outputs.pod_url }}
  run: |
    # Polls <pod url>/health every 10 seconds for up to 20 minutes and succeeds
    # on the first HTTP 200. Without a pod URL the check is skipped.
    set -e

    if [ -z "${POD_URL:-}" ]; then
      echo "No pod_url available; skipping health check."
      exit 0
    fi

    # Drop one trailing slash (if any) before appending the path.
    health_url="${POD_URL%/}/health"
    echo "Polling ${health_url} ..."

    give_up_at=$(( $(date +%s) + 20*60 ))
    status=""
    until [ "$(date +%s)" -ge "$give_up_at" ]; do
      # Only the status code is captured; transport errors are tolerated so the
      # loop keeps polling.
      status="$(curl -sS -m 10 -o /dev/null -w "%{http_code}" "${health_url}" || true)"
      if [ "$status" = "200" ]; then
        echo "Deployed API is healthy."
        exit 0
      fi
      sleep 10
    done

    echo "Timed out waiting for deployed /health: last_status=${status}"
    exit 1
- name: Add deployment summary
  if: always()
  run: |
    # Appends a Markdown report (pod info, image/template details, endpoint
    # list) to the job summary. Runs even when earlier steps failed.
    pod_id="${{ steps.deploy-pod.outputs.pod_id }}"
    pod_url="${{ steps.deploy-pod.outputs.pod_url }}"
    pod_status="${{ steps.deploy-pod.outputs.pod_status }}"
    template_name="${{ steps.create-template.outputs.template_name }}"
    image_ref="${{ steps.image-tag.outputs.full_image }}"

    {
      printf '%s\n' \
        "## πŸš€ YLFF Deployment Summary" \
        "" \
        "### Pod Information"

      # Pod section: details when an ID is known, a notice otherwise.
      if [ -z "$pod_id" ]; then
        printf '%s\n' \
          "⚠️ Pod details not available" \
          ""
      else
        printf '%s\n' \
          "- **Pod Name:** ylff-dev-stable" \
          "- **Pod ID:** \`$pod_id\`" \
          "- **Status:** $pod_status" \
          "" \
          "### πŸ”— Connection URLs" \
          "- **API Server:** [$pod_url]($pod_url)" \
          "- **API Docs:** [$pod_url/docs]($pod_url/docs)" \
          "- **Health Check:** [$pod_url/health]($pod_url/health)" \
          ""
      fi

      printf '%s\n' \
        "### πŸ“¦ Deployment Details" \
        "- **Docker Image:** \`$image_ref\`" \
        "- **Template:** $template_name" \
        "- **Template ID:** \`${{ steps.create-template.outputs.template_id }}\`" \
        ""

      # Static endpoint reference; single quotes keep the backticks literal.
      printf '%s\n' \
        "### πŸ“š API Endpoints" \
        '- `GET /` - API information' \
        '- `GET /health` - Health check' \
        '- `GET /models` - List available models' \
        '- `POST /api/v1/validate/sequence` - Validate sequence' \
        '- `POST /api/v1/validate/arkit` - Validate ARKit data' \
        '- `POST /api/v1/dataset/build` - Build training dataset' \
        '- `POST /api/v1/train/start` - Fine-tune model' \
        '- `POST /api/v1/train/pretrain` - Pre-train on ARKit' \
        '- `POST /api/v1/eval/ba-agreement` - Evaluate BA agreement' \
        '- `POST /api/v1/visualize` - Visualize results' \
        '- `GET /api/v1/jobs` - List all jobs' \
        '- `GET /api/v1/jobs/{job_id}` - Get job status'
    } >> "$GITHUB_STEP_SUMMARY"