---
# Deploy the YLFF API image from Amazon ECR to a single, stably named RunPod pod.
#
# Triggers:
#   * workflow_run      - after "RunPod H100x1 Smoke Test" completes on main/dev
#                         (the job itself only runs when that run succeeded).
#   * workflow_dispatch - manual run with optional image tag / GPU overrides.
#
# Flow: install + configure runpodctl -> assume the AWS role and log in to ECR
# -> resolve the image tag -> remove the previous pod -> recreate the RunPod
# registry auth and pod template -> create the pod -> poll /health -> write a
# job summary.
#
# Conventions used in the scripts below:
#   * Values coming from the event payload, secrets or step outputs are passed
#     in through `env:` and read as quoted shell variables; they are never
#     pasted into the script text with `${{ }}`.
#   * JSON request bodies are built with `jq -n --arg` so values are escaped
#     correctly.
name: Deploy to RunPod

on:
  workflow_run:
    workflows: ["RunPod H100x1 Smoke Test"]
    types:
      - completed
    branches:
      - main
      - dev
  workflow_dispatch:
    inputs:
      image_tag:
        description: "Docker image tag to deploy"
        required: false
        default: "auto"
      gpu_type:
        description: "RunPod GPU type (e.g. NVIDIA RTX A6000, NVIDIA H100 PCIe)"
        required: false
        default: "NVIDIA RTX A6000"
      gpu_count:
        description: "GPU count"
        required: false
        default: "1"

# Workflow-level env is exported to every `run:` script, so the scripts read
# these as ordinary shell variables.
env:
  AWS_REGION: us-east-1
  ECR_REPOSITORY: ylff
  RUNPOD_TEMPLATE_NAME: "YLFF-Dev-Template"

# A newer run for the same workflow/ref cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# id-token: write is required for the OIDC role assumption below.
permissions:
  contents: read
  id-token: write

jobs:
  deploy:
    runs-on: ubuntu-latest
    # Manual runs always deploy; chained runs only after a successful upstream run.
    if: >-
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success')
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Cache pip packages
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-runpod-${{ hashFiles('**/requirements*.txt') }}
          restore-keys: |
            ${{ runner.os }}-pip-runpod-

      - name: Install RunPod CLI
        run: |
          set -e
          echo "Installing runpodctl from GitHub releases..."

          # Get the latest version from the GitHub API. The lookup is
          # best-effort: any failure (rate limit, non-JSON reply) leaves the
          # variable empty so the pinned fallback below is used instead of
          # aborting the step.
          LATEST_VERSION=$(curl -fsSL https://api.github.com/repos/Run-Pod/runpodctl/releases/latest \
            | jq -r '.tag_name // empty' 2>/dev/null || true)

          if [ -z "$LATEST_VERSION" ] || [ "$LATEST_VERSION" = "null" ]; then
            echo "Failed to get latest version, using fallback version v1.14.3"
            LATEST_VERSION="v1.14.3"
          fi

          echo "Installing runpodctl version: $LATEST_VERSION"

          # Download and install runpodctl
          wget --quiet --show-progress \
            "https://github.com/Run-Pod/runpodctl/releases/download/${LATEST_VERSION}/runpodctl-linux-amd64" \
            -O runpodctl

          # Make it executable and move to system path
          chmod +x runpodctl
          sudo mv runpodctl /usr/local/bin/runpodctl

          # Verify installation
          echo "Verifying runpodctl installation..."
          runpodctl version
          echo "runpodctl installed successfully"

      - name: Configure RunPod
        env:
          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
        run: |
          echo "Configuring runpodctl with API key..."

          # The key is read from the environment rather than interpolated into
          # the script text.
          if runpodctl config --apiKey "$RUNPOD_API_KEY"; then
            echo "runpodctl configured successfully using config command"
          else
            echo "Config command failed, using manual YAML configuration..."
            # Fallback to manual YAML configuration
            mkdir -p ~/.runpod
            printf 'apiKey: %s\n' "$RUNPOD_API_KEY" > ~/.runpod/.runpod.yaml
            chmod 600 ~/.runpod/.runpod.yaml
            echo "Manual YAML configuration completed"
          fi

          # Verify configuration (informational only; never fails the step)
          echo "Testing runpodctl configuration..."
          if runpodctl get pod --help > /dev/null 2>&1; then
            echo "runpodctl configuration verified successfully"
          else
            echo "Warning: runpodctl configuration verification failed, but continuing..."
          fi

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::211125621822:role/github-actions-role
          aws-region: ${{ env.AWS_REGION }}
          role-session-name: GitHubActionsSession
          output-credentials: true

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # Resolves the tag to deploy and exposes `image_tag` / `full_image`.
      # For "auto" or "latest" the immutable "<branch>-<short sha>" tag is
      # preferred when it exists in ECR; otherwise "auto" degrades to "latest".
      - name: Determine image tag
        id: image-tag
        env:
          EVENT_NAME: ${{ github.event_name }}
          RUN_HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
          RUN_HEAD_SHA: ${{ github.event.workflow_run.head_sha }}
          REQUESTED_IMAGE_TAG: ${{ github.event.inputs.image_tag }}
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          set -euo pipefail

          if [ "${EVENT_NAME}" = "workflow_run" ]; then
            # Chained runs describe the commit that the upstream workflow tested.
            IMAGE_TAG="auto"
            BRANCH="${RUN_HEAD_BRANCH}"
            SHORT_SHA="${RUN_HEAD_SHA::7}"
          else
            IMAGE_TAG="${REQUESTED_IMAGE_TAG}"
            BRANCH="${GITHUB_REF_NAME}"
            SHORT_SHA="${GITHUB_SHA::7}"
          fi

          if [ -z "${IMAGE_TAG}" ]; then
            IMAGE_TAG="auto"
          fi

          CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}"
          if [ "${IMAGE_TAG}" = "latest" ] || [ "${IMAGE_TAG}" = "auto" ]; then
            if aws ecr describe-images \
              --repository-name "${ECR_REPOSITORY}" \
              --image-ids "imageTag=${CANDIDATE_TAG}" \
              --region "${AWS_REGION}" >/dev/null 2>&1; then
              echo "Using immutable ECR tag: ${CANDIDATE_TAG}"
              IMAGE_TAG="${CANDIDATE_TAG}"
            else
              if [ "${IMAGE_TAG}" = "auto" ]; then
                IMAGE_TAG="latest"
              fi
              echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${IMAGE_TAG}"
            fi
          fi

          # Use ECR image path
          FULL_IMAGE="${ECR_REGISTRY}/${ECR_REPOSITORY}:${IMAGE_TAG}"

          echo "image_tag=${IMAGE_TAG}" >> "$GITHUB_OUTPUT"
          echo "full_image=${FULL_IMAGE}" >> "$GITHUB_OUTPUT"
          echo "Branch: ${BRANCH:-unknown}"
          echo "Using image: ${FULL_IMAGE}"

      # Informational check only: a missing image is reported but does not
      # fail the job.
      - name: Verify image exists in ECR
        env:
          FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
          IMAGE_TAG: ${{ steps.image-tag.outputs.image_tag }}
        run: |
          echo "🔍 Verifying image exists in ECR..."
          echo "Checking for: ${FULL_IMAGE}"

          # Try to describe the image in ECR
          if aws ecr describe-images \
            --repository-name "${ECR_REPOSITORY}" \
            --image-ids "imageTag=${IMAGE_TAG}" \
            --region "${AWS_REGION}" 2>/dev/null; then
            echo "✅ Image found in ECR with tag: ${IMAGE_TAG}"
          else
            echo "❌ Image not found with tag: ${IMAGE_TAG}"
            echo "🔍 Checking available tags..."

            # List available tags
            AVAILABLE_TAGS=$(aws ecr describe-images \
              --repository-name "${ECR_REPOSITORY}" \
              --region "${AWS_REGION}" \
              --query 'imageDetails[*].imageTags[*]' \
              --output text 2>/dev/null || echo "")

            if [ -n "$AVAILABLE_TAGS" ]; then
              echo "Available tags in ECR:"
              echo "$AVAILABLE_TAGS"
            else
              echo "No tags found in ECR repository"
            fi

            echo "⚠️ Continuing anyway - image may be available or will be created"
          fi

      # Exposes `ecr_credentials` (registry password) and `ecr_registry`.
      # NOTE(review): ECR authorization tokens are short-lived, so a pod that
      # re-pulls the image long after this deployment may need a fresh token -
      # confirm this matches how the pod is operated.
      - name: Get ECR credentials for RunPod
        id: ecr-credentials
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
        run: |
          echo "🔐 Getting ECR credentials for RunPod authentication..."

          ECR_CREDENTIALS=$(aws ecr get-login-password --region "${AWS_REGION}")

          # Register the password as a masked value BEFORE it becomes a step
          # output; otherwise it can appear in clear text in the logs of any
          # later step that receives it through `env:`.
          echo "::add-mask::${ECR_CREDENTIALS}"

          echo "ecr_credentials=${ECR_CREDENTIALS}" >> "$GITHUB_OUTPUT"
          echo "ecr_registry=${ECR_REGISTRY}" >> "$GITHUB_OUTPUT"

          echo "✅ ECR credentials retrieved"

      # Stops and removes every pod whose listing line contains the stable pod
      # name, then waits (up to 10 x 10s) until it no longer shows up.
      # Exposes `pod_id` (space-separated if more than one matched, empty if none).
      - name: Stop and Remove Existing Pod
        id: stop-pod
        env:
          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
          STABLE_POD_NAME: "ylff-dev-stable"
        run: |
          # Print the pod table; an error from runpodctl yields empty output.
          list_pods() {
            runpodctl get pod --allfields 2>/dev/null || echo ""
          }

          echo "🔍 Checking for existing pod: $STABLE_POD_NAME"

          # -F: match the name literally rather than as a regular expression.
          EXISTING_POD_IDS=$(list_pods | grep -F -- "$STABLE_POD_NAME" | awk '{print $1}' || true)

          if [ -n "$EXISTING_POD_IDS" ]; then
            POD_ID_LIST=$(echo "$EXISTING_POD_IDS" | paste -sd' ' -)
            echo "Found existing pod: $POD_ID_LIST"
            echo "pod_id=${POD_ID_LIST}" >> "$GITHUB_OUTPUT"

            # Stop the pod(s) first; failures are tolerated (best-effort cleanup).
            echo "Stopping pod..."
            for existing_pod_id in $EXISTING_POD_IDS; do
              runpodctl stop pod "$existing_pod_id" || true
            done
            sleep 20

            # Remove the pod(s)
            echo "Removing pod..."
            for existing_pod_id in $EXISTING_POD_IDS; do
              runpodctl remove pod "$existing_pod_id" || true
            done
            sleep 20

            # Verify pod is fully removed before proceeding
            echo "Verifying pod removal..."
            for verify_attempt in {1..10}; do
              if ! list_pods | grep -qF -- "$STABLE_POD_NAME"; then
                echo "✅ Pod fully removed"
                break
              else
                echo "Pod still exists (attempt $verify_attempt/10), waiting..."
                sleep 10
              fi
            done

            echo "✅ Proceeding with template and auth cleanup"
          else
            echo "No existing pod found"
            echo "pod_id=" >> "$GITHUB_OUTPUT"
          fi

      # Recreates the RunPod container-registry auth and pod template so the
      # template points at the freshly resolved image and a fresh ECR password.
      # Exposes `template_id` and `template_name`. When the old template or
      # auth cannot be deleted, a "-<unix timestamp>" suffix is appended to the
      # name instead of failing.
      - name: Create or Update RunPod Template
        id: create-template
        env:
          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
          FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
          ECR_CREDENTIALS: ${{ steps.ecr-credentials.outputs.ecr_credentials }}
          ECR_REGISTRY: ${{ steps.ecr-credentials.outputs.ecr_registry }}
        run: |
          TEMPLATE_NAME="${RUNPOD_TEMPLATE_NAME}"
          TIMESTAMP=$(date +%s)
          GRAPHQL_URL="https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}"
          AUTH_URL="https://rest.runpod.io/v1/containerregistryauth"

          # ---- helpers ---------------------------------------------------

          # POST the JSON body in $1 to the RunPod GraphQL endpoint; prints the response.
          runpod_graphql() {
            curl -s --request POST \
              --header 'content-type: application/json' \
              --url "$GRAPHQL_URL" \
              --data "$1"
          }

          # Print the id of every template whose field $1 ("name" or "id")
          # equals $2. Prints nothing when there is no match or the response
          # cannot be parsed.
          find_template() {
            runpod_graphql '{"query":"query { myself { podTemplates { id name } } }"}' \
              | jq -r --arg field "$1" --arg value "$2" \
                '.data.myself.podTemplates[] | select(.[$field] == $value) | .id' 2>/dev/null \
              || echo ""
          }

          # Print the JSON list of container registry auths.
          list_auths() {
            curl -s --request GET \
              --header 'Content-Type: application/json' \
              --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
              --url "$AUTH_URL"
          }

          # Succeed when the registry auth with id $1 is present in the auth list.
          verify_auth_exists() {
            local auth_id_to_check="$1" found_id
            if [ -z "$auth_id_to_check" ] || [ "$auth_id_to_check" = "null" ]; then
              return 1
            fi
            found_id=$(list_auths \
              | jq -r --arg id "$auth_id_to_check" '.[] | select(.id == $id) | .id' 2>/dev/null \
              || echo "")
            [ -n "$found_id" ] && [ "$found_id" != "null" ]
          }

          # Create a registry auth named $1 for user "AWS" with the ECR
          # password; prints the raw response.
          create_auth() {
            local payload
            payload=$(jq -n --arg name "$1" --arg password "$ECR_CREDENTIALS" \
              '{name: $name, username: "AWS", password: $password}')
            curl -s --request POST \
              --header 'Content-Type: application/json' \
              --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
              --url "$AUTH_URL" \
              --data "$payload"
          }

          # Print the `id` field of the auth-creation response in $1 (empty if absent).
          auth_id_from() {
            echo "$1" | jq -r '.id // empty' 2>/dev/null || echo ""
          }

          # Send the saveTemplate mutation using the CURRENT values of
          # TEMPLATE_NAME, FULL_IMAGE and AUTH_ID; prints the raw response.
          # Every attempt (first try and retries) goes through this function so
          # all of them create the same template definition.
          create_template() {
            local query
            query='mutation { saveTemplate(input: {'
            query+=' containerDiskInGb: 10,'
            query+=' dockerArgs: "python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000",'
            query+=' env: ['
            query+=' { key: "PYTHONUNBUFFERED", value: "1" },'
            query+=' { key: "PYTHONPATH", value: "/app" },'
            query+=' { key: "XDG_CACHE_HOME", value: "/workspace/.cache" },'
            query+=' { key: "HF_HOME", value: "/workspace/.cache/huggingface" },'
            query+=' { key: "HUGGINGFACE_HUB_CACHE", value: "/workspace/.cache/huggingface/hub" },'
            query+=' { key: "TRANSFORMERS_CACHE", value: "/workspace/.cache/huggingface/transformers" },'
            query+=' { key: "TORCH_HOME", value: "/workspace/.cache/torch" }'
            query+=' ],'
            query+=" imageName: \"${FULL_IMAGE}\","
            query+=" name: \"${TEMPLATE_NAME}\","
            query+=' ports: "22/tcp,8000/http",'
            # Single quotes keep "\n" as a literal backslash-n, i.e. a GraphQL
            # string escape, not a raw newline inside the string.
            query+=' readme: "## YLFF Template\nTemplate for running YLFF API server on port 8000",'
            query+=' volumeInGb: 20,'
            query+=' volumeMountPath: "/workspace",'
            query+=" containerRegistryAuthId: \"${AUTH_ID}\""
            query+=' }) { id } }'
            runpod_graphql "$(jq -n --arg query "$query" '{query: $query}')"
          }

          # Print the created template id from the saveTemplate response in $1.
          template_id_from() {
            echo "$1" | jq -r '.data.saveTemplate.id // empty' 2>/dev/null || echo ""
          }

          # ---- 1. delete the previous template (best effort) -------------

          EXISTING_TEMPLATE_ID=$(find_template name "$TEMPLATE_NAME")

          if [ -n "$EXISTING_TEMPLATE_ID" ] && [ "$EXISTING_TEMPLATE_ID" != "null" ]; then
            echo "Found existing template: $EXISTING_TEMPLATE_ID"
            echo "Deleting old template..."

            # NOTE(review): confirm the deleteTemplate argument name against the
            # RunPod GraphQL schema; a rejected mutation is logged below and the
            # script falls back to a timestamp-suffixed template name.
            DELETE_QUERY="mutation { deleteTemplate(templateId: \"${EXISTING_TEMPLATE_ID}\") }"
            DELETE_PAYLOAD=$(jq -n --arg query "$DELETE_QUERY" '{query: $query}')

            # Try to delete the template (multiple attempts with delays)
            for attempt in {1..3}; do
              DELETE_RESPONSE=$(runpod_graphql "$DELETE_PAYLOAD")
              DELETE_ERROR=$(echo "$DELETE_RESPONSE" | jq -r '.errors[0].message // empty' 2>/dev/null || echo "")
              if [ -n "$DELETE_ERROR" ]; then
                echo "⚠️ deleteTemplate returned an error: $DELETE_ERROR"
              fi
              sleep 5

              # Verify template was deleted
              STILL_EXISTS=$(find_template id "$EXISTING_TEMPLATE_ID")
              if [ -z "$STILL_EXISTS" ]; then
                echo "✅ Template deleted successfully"
                break
              else
                echo "⚠️ Template still exists (attempt $attempt/3), waiting longer..."
                sleep 10
              fi
            done
            # Whether the name is free again is decided in section 3, right
            # before the template is created.
          fi

          # ---- 2. recreate the ECR registry auth -------------------------

          AUTH_NAME="ecr-auth-ylff"
          AUTH_ID=""

          # Check if auth already exists
          EXISTING_AUTH_ID=$(list_auths \
            | jq -r --arg name "$AUTH_NAME" '.[] | select(.name == $name) | .id' 2>/dev/null \
            || echo "")

          if [ -n "$EXISTING_AUTH_ID" ] && [ "$EXISTING_AUTH_ID" != "null" ]; then
            echo "Found existing ECR auth: $EXISTING_AUTH_ID"

            # Verify it actually exists before trying to delete
            if verify_auth_exists "$EXISTING_AUTH_ID"; then
              echo "Verifying auth exists before deletion..."

              # Try to delete it, but handle errors gracefully
              DELETE_AUTH_HTTP_CODE=$(curl -s -o /tmp/auth_delete_response.txt -w "%{http_code}" --request DELETE \
                --header 'Content-Type: application/json' \
                --header "Authorization: Bearer ${RUNPOD_API_KEY}" \
                --url "${AUTH_URL}/${EXISTING_AUTH_ID}")
              DELETE_AUTH_RESPONSE=$(cat /tmp/auth_delete_response.txt 2>/dev/null || echo "")

              # Check if deletion succeeded (204/200 are success codes)
              if [ "$DELETE_AUTH_HTTP_CODE" = "204" ] || [ "$DELETE_AUTH_HTTP_CODE" = "200" ]; then
                echo "✅ ECR auth deleted successfully (HTTP $DELETE_AUTH_HTTP_CODE)"

                # Wait and verify deletion (for informational/logging purposes)
                sleep 3
                for verify_attempt in {1..5}; do
                  if ! verify_auth_exists "$EXISTING_AUTH_ID"; then
                    echo "✅ Auth deletion verified (attempt $verify_attempt)"
                    break
                  else
                    echo "⚠️ Auth still exists (attempt $verify_attempt/5), waiting..."
                    sleep 2
                  fi
                done
              elif echo "$DELETE_AUTH_RESPONSE" | grep -qi "in use\|error\|failed"; then
                echo "⚠️ ECR auth deletion failed (HTTP $DELETE_AUTH_HTTP_CODE)"
                echo "Response: $DELETE_AUTH_RESPONSE"
                echo "Auth may be in use. Will create new auth with timestamp suffix"
                AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
              else
                echo "⚠️ ECR auth deletion returned unexpected status (HTTP $DELETE_AUTH_HTTP_CODE)"
                echo "Response: $DELETE_AUTH_RESPONSE"
                echo "Will create new auth with timestamp suffix"
                AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
              fi
            else
              echo "⚠️ Existing auth ID found but doesn't exist in RunPod, will create new one"
            fi
          fi

          # Create new ECR auth (always create fresh to avoid stale references)
          echo "Creating new ECR auth: $AUTH_NAME"
          AUTH_RESPONSE=$(create_auth "$AUTH_NAME")
          AUTH_ID=$(auth_id_from "$AUTH_RESPONSE")

          if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then
            ERROR_MSG=$(echo "$AUTH_RESPONSE" | jq -r '.message // .error // "Unknown error"' 2>/dev/null || echo "")
            echo "❌ Failed to create ECR auth"
            echo "Response: $AUTH_RESPONSE"
            echo "Error: $ERROR_MSG"

            # Try with timestamp suffix as fallback
            AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}"
            echo "Retrying with name: $AUTH_NAME"
            AUTH_RESPONSE=$(create_auth "$AUTH_NAME")
            AUTH_ID=$(auth_id_from "$AUTH_RESPONSE")

            if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then
              echo "❌ Failed to create ECR auth even with timestamp suffix"
              echo "Response: $AUTH_RESPONSE"
              exit 1
            fi
          fi

          # Verify the auth was created and exists
          echo "Verifying created ECR auth: $AUTH_ID"
          sleep 2
          if verify_auth_exists "$AUTH_ID"; then
            echo "✅ ECR authentication verified: $AUTH_ID"
          else
            echo "⚠️ ECR auth created but verification failed, waiting longer..."
            sleep 5
            if verify_auth_exists "$AUTH_ID"; then
              echo "✅ ECR authentication verified after wait: $AUTH_ID"
            else
              echo "❌ ECR auth verification failed after retry"
              echo "This may cause template creation to fail"
            fi
          fi

          # ---- 3. create the template -------------------------------------

          # Final check: ensure template name is available before creating
          NAME_EXISTS=$(find_template name "$TEMPLATE_NAME")
          if [ -n "$NAME_EXISTS" ] && [ "$NAME_EXISTS" != "null" ]; then
            echo "⚠️ Template name '$TEMPLATE_NAME' still exists, using timestamp suffix"
            TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}"
            echo "Using new template name: $TEMPLATE_NAME"
          fi

          # Verify auth still exists before using it
          if ! verify_auth_exists "$AUTH_ID"; then
            echo "❌ Cannot create template: ECR auth ID $AUTH_ID does not exist"
            echo "This may indicate a timing issue. Please retry the deployment."
            exit 1
          fi

          # Create new template with ECR auth
          echo "Creating template: $TEMPLATE_NAME"
          echo "Using ECR auth ID: $AUTH_ID"
          echo "Using image: ${FULL_IMAGE}"

          CREATE_RESPONSE=$(create_template)
          TEMPLATE_ID=$(template_id_from "$CREATE_RESPONSE")

          if [ -z "$TEMPLATE_ID" ]; then
            # `// empty` keeps these blank (not the string "null") when the
            # response carries no errors array.
            ERROR_MSG=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].message // empty' 2>/dev/null || echo "")
            ERROR_PATH=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].path[0] // empty' 2>/dev/null || echo "")

            echo "❌ Failed to create template"
            echo "Response: $CREATE_RESPONSE"

            if [ -z "$ERROR_MSG" ]; then
              exit 1
            fi

            echo "Error message: $ERROR_MSG"
            echo "Error path: $ERROR_PATH"

            # Handle specific error cases
            if echo "$ERROR_MSG" | grep -qi "Registry Auth not found\|containerRegistryAuthId"; then
              echo "❌ ECR auth ID $AUTH_ID not found in RunPod"
              echo "Attempting to verify auth existence..."

              if ! verify_auth_exists "$AUTH_ID"; then
                echo "❌ Auth does not exist. Cannot create template."
                exit 1
              fi

              echo "⚠️ Auth exists but template creation failed. This may be a RunPod API issue."
              echo "Retrying template creation after delay..."
              sleep 5

              # Retry once
              CREATE_RESPONSE=$(create_template)
              TEMPLATE_ID=$(template_id_from "$CREATE_RESPONSE")
              if [ -z "$TEMPLATE_ID" ]; then
                echo "❌ Retry also failed"
                exit 1
              fi
            elif echo "$ERROR_MSG" | grep -qi "unique\|already exists"; then
              echo "⚠️ Template name already exists, trying with timestamp suffix"
              TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}"

              # Try again with timestamp
              CREATE_RESPONSE=$(create_template)
              TEMPLATE_ID=$(template_id_from "$CREATE_RESPONSE")
              if [ -z "$TEMPLATE_ID" ]; then
                echo "❌ Failed to create template even with timestamp suffix"
                echo "Response: $CREATE_RESPONSE"
                exit 1
              fi
            else
              exit 1
            fi
          fi

          echo "template_id=$TEMPLATE_ID" >> "$GITHUB_OUTPUT"
          echo "template_name=$TEMPLATE_NAME" >> "$GITHUB_OUTPUT"
          echo "✅ Template created/updated: $TEMPLATE_ID (name: $TEMPLATE_NAME)"

      # Creates the pod from the template (one retry on failure) and exposes
      # `pod_id`, `pod_url` and `pod_status` when the pod shows up in the listing.
      - name: Deploy to RunPod - Create or Update Pod
        id: deploy-pod
        env:
          RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }}
          FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
          STABLE_POD_NAME: "ylff-dev-stable"
          TEMPLATE_ID: ${{ steps.create-template.outputs.template_id }}
          # Manual runs may override the GPU; chained runs use the defaults.
          GPU_TYPE: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_type || 'NVIDIA RTX A6000' }}
          GPU_COUNT: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_count || '1' }}
        run: |
          set -euo pipefail

          # Print the pod table; an error from runpodctl yields empty output.
          list_pods() {
            runpodctl get pod --allfields 2>/dev/null || echo ""
          }

          # Issue the create request; returns runpodctl's exit status.
          create_pod() {
            runpodctl create pod \
              --name="$STABLE_POD_NAME" \
              --imageName="$FULL_IMAGE" \
              --templateId="$TEMPLATE_ID" \
              --gpuType="$GPU_TYPE" \
              --gpuCount="$GPU_COUNT" \
              --secureCloud \
              --containerDiskSize=20 \
              --mem=32 \
              --vcpu=4
          }

          # Check if pod already exists (normally the earlier cleanup step has
          # removed it; this is a second safety net).
          ALL_PODS_OUTPUT=$(list_pods)
          EXISTING_POD_IDS=$(grep -F -- "$STABLE_POD_NAME" <<<"$ALL_PODS_OUTPUT" | awk '{print $1}' || true)

          if [ -n "$EXISTING_POD_IDS" ]; then
            echo "Found existing pod: $(paste -sd' ' - <<<"$EXISTING_POD_IDS")"

            # Stop and remove the pod
            echo "Stopping existing pod for update..."
            for existing_pod_id in $EXISTING_POD_IDS; do
              runpodctl stop pod "$existing_pod_id" || true
            done
            sleep 10

            echo "Removing old pod to deploy new version..."
            for existing_pod_id in $EXISTING_POD_IDS; do
              runpodctl remove pod "$existing_pod_id" || true
            done
            sleep 15
          else
            echo "No existing pod found, will create new one"
          fi

          sleep 10

          # Create the pod
          echo "Creating pod: $STABLE_POD_NAME"
          echo "Using image: $FULL_IMAGE"
          echo "Using template: $TEMPLATE_ID"

          # The command is tested directly in `if`: with `set -e` active, a
          # separate `$?` check after a failing command would never be reached,
          # so the retry would never run.
          if ! create_pod; then
            echo "Failed to create pod, retrying once..."
            sleep 10
            if ! create_pod; then
              echo "❌ Failed to create pod after retry"
              exit 1
            fi
          fi

          # Wait for pod to initialize
          echo "Waiting for pod to initialize..."
          sleep 30

          # Get pod details
          ALL_PODS_OUTPUT=$(list_pods)
          POD_LINE=$(grep -m1 -F -- "$STABLE_POD_NAME" <<<"$ALL_PODS_OUTPUT" || true)

          if [ -n "$POD_LINE" ]; then
            POD_ID=$(awk '{print $1}' <<<"$POD_LINE")
            # NOTE(review): field 7 is taken as the status column; whitespace
            # splitting shifts if earlier columns contain spaces - confirm
            # against real `runpodctl get pod --allfields` output.
            POD_STATUS=$(awk '{print $7}' <<<"$POD_LINE")
            POD_URL="https://${POD_ID}-8000.proxy.runpod.net"

            echo "✅ Pod created successfully!"
            echo "   Pod Name: $STABLE_POD_NAME"
            echo "   Pod ID: $POD_ID"
            echo "   Status: $POD_STATUS"
            echo "   Backend URL: $POD_URL"

            # Save pod details for summary
            echo "pod_id=${POD_ID}" >> "$GITHUB_OUTPUT"
            echo "pod_url=${POD_URL}" >> "$GITHUB_OUTPUT"
            echo "pod_status=${POD_STATUS}" >> "$GITHUB_OUTPUT"
          else
            echo "⚠️ Pod created but details not available yet"
          fi

      # Polls <pod_url>/health every 10s for up to 20 minutes. Runs even after
      # earlier failures, but is a no-op when no pod URL was produced.
      - name: Wait for deployed API health
        if: always()
        env:
          POD_URL: ${{ steps.deploy-pod.outputs.pod_url }}
        run: |
          set -e
          if [ -z "${POD_URL:-}" ]; then
            echo "No pod_url available; skipping health check."
            exit 0
          fi

          URL="${POD_URL%/}/health"
          echo "Polling ${URL} ..."
          deadline=$(( $(date +%s) + 20*60 ))
          last=""
          while [ "$(date +%s)" -lt "$deadline" ]; do
            # -sS: quiet but show errors, -m: max time, -o /dev/null: no body, -w: print status
            code="$(curl -sS -m 10 -o /dev/null -w "%{http_code}" "${URL}" || true)"
            last="$code"
            if [ "$code" = "200" ]; then
              echo "Deployed API is healthy."
              exit 0
            fi
            sleep 10
          done
          echo "Timed out waiting for deployed /health: last_status=${last}"
          exit 1

      # Appends a Markdown report to the job summary; runs regardless of
      # earlier failures so partial results are still visible.
      - name: Add deployment summary
        if: always()
        env:
          POD_ID: ${{ steps.deploy-pod.outputs.pod_id }}
          POD_URL: ${{ steps.deploy-pod.outputs.pod_url }}
          POD_STATUS: ${{ steps.deploy-pod.outputs.pod_status }}
          TEMPLATE_NAME: ${{ steps.create-template.outputs.template_name }}
          TEMPLATE_ID: ${{ steps.create-template.outputs.template_id }}
          FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }}
        run: |
          {
            echo "## 🚀 YLFF Deployment Summary"
            echo ""
            echo "### Pod Information"
            if [ -n "$POD_ID" ]; then
              echo "- **Pod Name:** ylff-dev-stable"
              echo "- **Pod ID:** \`$POD_ID\`"
              echo "- **Status:** $POD_STATUS"
              echo ""
              echo "### 🔗 Connection URLs"
              echo "- **API Server:** [$POD_URL]($POD_URL)"
              echo "- **API Docs:** [$POD_URL/docs]($POD_URL/docs)"
              echo "- **Health Check:** [$POD_URL/health]($POD_URL/health)"
              echo ""
            else
              echo "⚠️ Pod details not available"
              echo ""
            fi
            echo "### 📦 Deployment Details"
            echo "- **Docker Image:** \`$FULL_IMAGE\`"
            echo "- **Template:** $TEMPLATE_NAME"
            echo "- **Template ID:** \`$TEMPLATE_ID\`"
            echo ""
            echo "### 📚 API Endpoints"
            echo "- \`GET /\` - API information"
            echo "- \`GET /health\` - Health check"
            echo "- \`GET /models\` - List available models"
            echo "- \`POST /api/v1/validate/sequence\` - Validate sequence"
            echo "- \`POST /api/v1/validate/arkit\` - Validate ARKit data"
            echo "- \`POST /api/v1/dataset/build\` - Build training dataset"
            echo "- \`POST /api/v1/train/start\` - Fine-tune model"
            echo "- \`POST /api/v1/train/pretrain\` - Pre-train on ARKit"
            echo "- \`POST /api/v1/eval/ba-agreement\` - Evaluate BA agreement"
            echo "- \`POST /api/v1/visualize\` - Visualize results"
            echo "- \`GET /api/v1/jobs\` - List all jobs"
            echo "- \`GET /api/v1/jobs/{job_id}\` - Get job status"
          } >> "$GITHUB_STEP_SUMMARY"