# Workflow-level configuration: triggers, shared environment, concurrency
# and token permissions for the RunPod deployment job.
name: Deploy to RunPod

on:
  # Automatic trigger: fires when the named smoke-test workflow completes
  # for the main or dev branch. The job itself checks the conclusion.
  workflow_run:
    workflows:
      - "RunPod H100x1 Smoke Test"
    types:
      - completed
    branches:
      - main
      - dev
  # Manual trigger with optional overrides for the image tag and GPU shape.
  workflow_dispatch:
    inputs:
      image_tag:
        description: "Docker image tag to deploy"
        required: false
        default: "auto"
      gpu_type:
        description: "RunPod GPU type (e.g. NVIDIA RTX A6000, NVIDIA H100 PCIe)"
        required: false
        default: "NVIDIA RTX A6000"
      gpu_count:
        description: "GPU count"
        required: false
        default: "1"

# Shared by every step; also exported into each step's shell environment.
env:
  AWS_REGION: us-east-1
  ECR_REPOSITORY: ylff
  RUNPOD_TEMPLATE_NAME: "YLFF-Dev-Template"

# One run per workflow/ref at a time; a newer run cancels the one in flight.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

# id-token: write lets the job request an OIDC token, which the AWS
# credentials step uses for its role-to-assume configuration.
permissions:
  contents: read
  id-token: write
|
jobs:
  deploy:
    runs-on: ubuntu-latest
    # Manual dispatches always run; workflow_run-triggered runs deploy only
    # when the upstream workflow run concluded successfully.
    if: >-
      (github.event_name == 'workflow_dispatch') ||
      (github.event_name == 'workflow_run' &&
      github.event.workflow_run.conclusion == 'success')

    steps:
| | - name: Checkout repository |
| | uses: actions/checkout@v4 |
| |
|
| | - name: Set up Python |
| | uses: actions/setup-python@v5 |
| | with: |
| | python-version: "3.11" |
| |
|
| | - name: Cache pip packages |
| | uses: actions/cache@v4 |
| | with: |
| | path: ~/.cache/pip |
| | key: ${{ runner.os }}-pip-runpod-${{ hashFiles('**/requirements*.txt') }} |
| | restore-keys: | |
| | ${{ runner.os }}-pip-runpod- |
| | |
| | - name: Install RunPod CLI |
| | run: | |
| | set -e |
| | echo "Installing runpodctl from GitHub releases..." |
| | |
| | |
| | LATEST_VERSION=$(curl -s https://api.github.com/repos/Run-Pod/runpodctl/releases/latest | jq -r '.tag_name') |
| | if [ -z "$LATEST_VERSION" ] || [ "$LATEST_VERSION" = "null" ]; then |
| | echo "Failed to get latest version, using fallback version v1.14.3" |
| | LATEST_VERSION="v1.14.3" |
| | fi |
| |
|
| | echo "Installing runpodctl version: $LATEST_VERSION" |
| |
|
| | |
| | wget --quiet --show-progress \ |
| | "https://github.com/Run-Pod/runpodctl/releases/download/${LATEST_VERSION}/runpodctl-linux-amd64" \ |
| | -O runpodctl |
| |
|
| | |
| | chmod +x runpodctl |
| | sudo mv runpodctl /usr/local/bin/runpodctl |
| |
|
| | |
| | echo "Verifying runpodctl installation..." |
| | runpodctl version |
| | echo "runpodctl installed successfully" |
| |
|
| | - name: Configure RunPod |
| | env: |
| | RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }} |
| | run: | |
| | echo "Configuring runpodctl with API key..." |
| | |
| | |
| | if runpodctl config --apiKey "${{ secrets.RUNPOD_API_KEY }}"; then |
| | echo "runpodctl configured successfully using config command" |
| | else |
| | echo "Config command failed, using manual YAML configuration..." |
| | |
| | mkdir -p ~/.runpod |
| | echo "apiKey: ${{ secrets.RUNPOD_API_KEY }}" > ~/.runpod/.runpod.yaml |
| | chmod 600 ~/.runpod/.runpod.yaml |
| | echo "Manual YAML configuration completed" |
| | fi |
| |
|
| | |
| | echo "Testing runpodctl configuration..." |
| | if runpodctl get pod --help > /dev/null 2>&1; then |
| | echo "runpodctl configuration verified successfully" |
| | else |
| | echo "Warning: runpodctl configuration verification failed, but continuing..." |
| | fi |
| |
|
| | - name: Configure AWS credentials |
| | uses: aws-actions/configure-aws-credentials@v4 |
| | with: |
| | role-to-assume: arn:aws:iam::211125621822:role/github-actions-role |
| | aws-region: ${{ env.AWS_REGION }} |
| | role-session-name: GitHubActionsSession |
| | output-credentials: true |
| |
|
| | - name: Login to Amazon ECR |
| | id: login-ecr |
| | uses: aws-actions/amazon-ecr-login@v2 |
| |
|
| | - name: Determine image tag |
| | id: image-tag |
| | run: | |
| | set -euo pipefail |
| | |
| | if [ "${{ github.event_name }}" = "workflow_run" ]; then |
| | IMAGE_TAG="auto" |
| | BRANCH="${{ github.event.workflow_run.head_branch }}" |
| | SHORT_SHA="$(echo "${{ github.event.workflow_run.head_sha }}" | cut -c1-7)" |
| | else |
| | IMAGE_TAG="${{ github.event.inputs.image_tag }}" |
| | BRANCH="${GITHUB_REF_NAME}" |
| | SHORT_SHA="${GITHUB_SHA::7}" |
| | fi |
| |
|
| | if [ -z "${IMAGE_TAG}" ]; then |
| | IMAGE_TAG="auto" |
| | fi |
| |
|
| | CANDIDATE_TAG="${BRANCH}-${SHORT_SHA}" |
| | if [ "${IMAGE_TAG}" = "latest" ] || [ "${IMAGE_TAG}" = "auto" ]; then |
| | if aws ecr describe-images \ |
| | --repository-name "${{ env.ECR_REPOSITORY }}" \ |
| | --image-ids "imageTag=${CANDIDATE_TAG}" \ |
| | --region "${{ env.AWS_REGION }}" >/dev/null 2>&1; then |
| | echo "Using immutable ECR tag: ${CANDIDATE_TAG}" |
| | IMAGE_TAG="${CANDIDATE_TAG}" |
| | else |
| | if [ "${IMAGE_TAG}" = "auto" ]; then |
| | IMAGE_TAG="latest" |
| | fi |
| | echo "Immutable tag not found (${CANDIDATE_TAG}); using tag: ${IMAGE_TAG}" |
| | fi |
| | fi |
| |
|
| | |
| | FULL_IMAGE="${{ steps.login-ecr.outputs.registry }}/${{ env.ECR_REPOSITORY }}:${IMAGE_TAG}" |
| |
|
| | echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT |
| | echo "full_image=${FULL_IMAGE}" >> $GITHUB_OUTPUT |
| | echo "Branch: ${BRANCH:-unknown}" |
| | echo "Using image: ${FULL_IMAGE}" |
| |
|
| | - name: Verify image exists in ECR |
| | run: | |
| | FULL_IMAGE="${{ steps.image-tag.outputs.full_image }}" |
| | IMAGE_TAG="${{ steps.image-tag.outputs.image_tag }}" |
| | |
| | echo "π Verifying image exists in ECR..." |
| | echo "Checking for: ${FULL_IMAGE}" |
| |
|
| | |
| | if aws ecr describe-images \ |
| | --repository-name ${{ env.ECR_REPOSITORY }} \ |
| | --image-ids imageTag=${IMAGE_TAG} \ |
| | --region ${{ env.AWS_REGION }} 2>/dev/null; then |
| | echo "β
Image found in ECR with tag: ${IMAGE_TAG}" |
| | else |
| | echo "β Image not found with tag: ${IMAGE_TAG}" |
| | echo "π Checking available tags..." |
| |
|
| | |
| | AVAILABLE_TAGS=$(aws ecr describe-images \ |
| | --repository-name ${{ env.ECR_REPOSITORY }} \ |
| | --region ${{ env.AWS_REGION }} \ |
| | --query 'imageDetails[*].imageTags[*]' \ |
| | --output text 2>/dev/null || echo "") |
| |
|
| | if [ -n "$AVAILABLE_TAGS" ]; then |
| | echo "Available tags in ECR:" |
| | echo "$AVAILABLE_TAGS" |
| | else |
| | echo "No tags found in ECR repository" |
| | fi |
| |
|
| | echo "β οΈ Continuing anyway - image may be available or will be created" |
| | fi |
| |
|
| | - name: Get ECR credentials for RunPod |
| | id: ecr-credentials |
| | run: | |
| | echo "π Getting ECR credentials for RunPod authentication..." |
| | ECR_CREDENTIALS=$(aws ecr get-login-password --region ${{ env.AWS_REGION }}) |
| | echo "ecr_credentials=${ECR_CREDENTIALS}" >> $GITHUB_OUTPUT |
| | echo "ecr_registry=${{ steps.login-ecr.outputs.registry }}" >> $GITHUB_OUTPUT |
| | echo "β
ECR credentials retrieved" |
| | |
| | - name: Stop and Remove Existing Pod |
| | id: stop-pod |
| | env: |
| | RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }} |
| | STABLE_POD_NAME: "ylff-dev-stable" |
| | run: | |
| | echo "π Checking for existing pod: $STABLE_POD_NAME" |
| | |
| | ALL_PODS_OUTPUT=$(runpodctl get pod --allfields 2>/dev/null || echo "") |
| |
|
| | if echo "$ALL_PODS_OUTPUT" | grep -q "$STABLE_POD_NAME"; then |
| | EXISTING_POD_ID=$(echo "$ALL_PODS_OUTPUT" | grep "$STABLE_POD_NAME" | awk '{print $1}') |
| | echo "Found existing pod: $EXISTING_POD_ID" |
| | echo "pod_id=${EXISTING_POD_ID}" >> $GITHUB_OUTPUT |
| |
|
| | |
| | echo "Stopping pod..." |
| | runpodctl stop pod "$EXISTING_POD_ID" || true |
| | sleep 20 |
| |
|
| | |
| | echo "Removing pod..." |
| | runpodctl remove pod "$EXISTING_POD_ID" || true |
| | sleep 20 |
| |
|
| | |
| | echo "Verifying pod removal..." |
| | for verify_attempt in {1..10}; do |
| | ALL_PODS_CHECK=$(runpodctl get pod --allfields 2>/dev/null || echo "") |
| | if ! echo "$ALL_PODS_CHECK" | grep -q "$STABLE_POD_NAME"; then |
| | echo "β
Pod fully removed" |
| | break |
| | else |
| | echo "Pod still exists (attempt $verify_attempt/10), waiting..." |
| | sleep 10 |
| | fi |
| | done |
| |
|
| | echo "β
Proceeding with template and auth cleanup" |
| | else |
| | echo "No existing pod found" |
| | echo "pod_id=" >> $GITHUB_OUTPUT |
| | fi |
| |
|
| | - name: Create or Update RunPod Template |
| | id: create-template |
| | env: |
| | RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }} |
| | FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }} |
| | ECR_CREDENTIALS: ${{ steps.ecr-credentials.outputs.ecr_credentials }} |
| | ECR_REGISTRY: ${{ steps.ecr-credentials.outputs.ecr_registry }} |
| | run: | |
| | TEMPLATE_NAME="${{ env.RUNPOD_TEMPLATE_NAME }}" |
| | |
| | |
| | TEMPLATES_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data '{"query":"query { myself { podTemplates { id name } } }"}') |
| |
|
| | EXISTING_TEMPLATE_ID=$(echo "$TEMPLATES_RESPONSE" | jq -r ".data.myself.podTemplates[] | select(.name == \"$TEMPLATE_NAME\") | .id" 2>/dev/null || echo "") |
| |
|
| | TIMESTAMP=$(date +%s) |
| |
|
| | if [ -n "$EXISTING_TEMPLATE_ID" ] && [ "$EXISTING_TEMPLATE_ID" != "null" ]; then |
| | echo "Found existing template: $EXISTING_TEMPLATE_ID" |
| | echo "Deleting old template..." |
| |
|
| | |
| | for attempt in {1..3}; do |
| | DELETE_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data "{\"query\":\"mutation { deleteTemplate(templateId: \\\"$EXISTING_TEMPLATE_ID\\\") }\"}") |
| |
|
| | sleep 5 |
| |
|
| | |
| | VERIFY_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data '{"query":"query { myself { podTemplates { id name } } }"}') |
| |
|
| | STILL_EXISTS=$(echo "$VERIFY_RESPONSE" | jq -r ".data.myself.podTemplates[] | select(.id == \"$EXISTING_TEMPLATE_ID\") | .id" 2>/dev/null || echo "") |
| |
|
| | if [ -z "$STILL_EXISTS" ]; then |
| | echo "β
Template deleted successfully" |
| | break |
| | else |
| | echo "β οΈ Template still exists (attempt $attempt/3), waiting longer..." |
| | sleep 10 |
| | fi |
| | done |
| |
|
| | |
| | FINAL_CHECK=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data '{"query":"query { myself { podTemplates { id name } } }"}') |
| |
|
| | STILL_EXISTS_FINAL=$(echo "$FINAL_CHECK" | jq -r ".data.myself.podTemplates[] | select(.name == \"$TEMPLATE_NAME\") | .id" 2>/dev/null || echo "") |
| |
|
| | if [ -n "$STILL_EXISTS_FINAL" ]; then |
| | echo "β οΈ Template with name '$TEMPLATE_NAME' still exists, using timestamp suffix" |
| | TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}" |
| | echo "New template name: $TEMPLATE_NAME" |
| | fi |
| | fi |
| |
|
| | |
| | AUTH_NAME="ecr-auth-ylff" |
| | AUTH_ID="" |
| |
|
| | |
| | verify_auth_exists() { |
| | local auth_id_to_check="$1" |
| | if [ -z "$auth_id_to_check" ] || [ "$auth_id_to_check" = "null" ]; then |
| | return 1 |
| | fi |
| | VERIFY_AUTHS=$(curl -s --request GET \ |
| | --header 'Content-Type: application/json' \ |
| | --header "Authorization: Bearer ${RUNPOD_API_KEY}" \ |
| | --url "https://rest.runpod.io/v1/containerregistryauth") |
| | VERIFY_ID=$(echo "$VERIFY_AUTHS" | jq -r ".[] | select(.id == \"$auth_id_to_check\") | .id" 2>/dev/null || echo "") |
| | [ -n "$VERIFY_ID" ] && [ "$VERIFY_ID" != "null" ] |
| | } |
| |
|
| | |
| | EXISTING_AUTHS=$(curl -s --request GET \ |
| | --header 'Content-Type: application/json' \ |
| | --header "Authorization: Bearer ${RUNPOD_API_KEY}" \ |
| | --url "https://rest.runpod.io/v1/containerregistryauth") |
| |
|
| | EXISTING_AUTH_ID=$(echo "$EXISTING_AUTHS" | jq -r ".[] | select(.name == \"$AUTH_NAME\") | .id" 2>/dev/null || echo "") |
| |
|
| | if [ -n "$EXISTING_AUTH_ID" ] && [ "$EXISTING_AUTH_ID" != "null" ]; then |
| | echo "Found existing ECR auth: $EXISTING_AUTH_ID" |
| |
|
| | |
| | if verify_auth_exists "$EXISTING_AUTH_ID"; then |
| | echo "Verifying auth exists before deletion..." |
| |
|
| | |
| | DELETE_AUTH_HTTP_CODE=$(curl -s -o /tmp/auth_delete_response.txt -w "%{http_code}" --request DELETE \ |
| | --header 'Content-Type: application/json' \ |
| | --header "Authorization: Bearer ${RUNPOD_API_KEY}" \ |
| | --url "https://rest.runpod.io/v1/containerregistryauth/$EXISTING_AUTH_ID") |
| |
|
| | DELETE_AUTH_RESPONSE=$(cat /tmp/auth_delete_response.txt 2>/dev/null || echo "") |
| |
|
| | |
| | if [ "$DELETE_AUTH_HTTP_CODE" = "204" ] || [ "$DELETE_AUTH_HTTP_CODE" = "200" ]; then |
| | echo "β
ECR auth deleted successfully (HTTP $DELETE_AUTH_HTTP_CODE)" |
| | |
| | DELETED_AUTH_ID="$EXISTING_AUTH_ID" |
| | |
| | |
| | EXISTING_AUTH_ID="" |
| | |
| | sleep 3 |
| | for verify_attempt in {1..5}; do |
| | if ! verify_auth_exists "$DELETED_AUTH_ID"; then |
| | echo "β
Auth deletion verified (attempt $verify_attempt)" |
| | break |
| | else |
| | echo "β οΈ Auth still exists (attempt $verify_attempt/5), waiting..." |
| | sleep 2 |
| | fi |
| | done |
| | elif echo "$DELETE_AUTH_RESPONSE" | grep -qi "in use\|error\|failed"; then |
| | echo "β οΈ ECR auth deletion failed (HTTP $DELETE_AUTH_HTTP_CODE)" |
| | echo "Response: $DELETE_AUTH_RESPONSE" |
| | echo "Auth may be in use. Will create new auth with timestamp suffix" |
| | AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}" |
| | EXISTING_AUTH_ID="" |
| | else |
| | echo "β οΈ ECR auth deletion returned unexpected status (HTTP $DELETE_AUTH_HTTP_CODE)" |
| | echo "Response: $DELETE_AUTH_RESPONSE" |
| | echo "Will create new auth with timestamp suffix" |
| | AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}" |
| | EXISTING_AUTH_ID="" |
| | fi |
| | else |
| | echo "β οΈ Existing auth ID found but doesn't exist in RunPod, will create new one" |
| | EXISTING_AUTH_ID="" |
| | fi |
| | fi |
| |
|
| | |
| | echo "Creating new ECR auth: $AUTH_NAME" |
| | AUTH_RESPONSE=$(curl -s --request POST \ |
| | --header 'Content-Type: application/json' \ |
| | --header "Authorization: Bearer ${RUNPOD_API_KEY}" \ |
| | --url "https://rest.runpod.io/v1/containerregistryauth" \ |
| | --data "{ |
| | \"name\": \"$AUTH_NAME\", |
| | \"username\": \"AWS\", |
| | \"password\": \"${ECR_CREDENTIALS}\" |
| | }") |
| |
|
| | AUTH_ID=$(echo "$AUTH_RESPONSE" | jq -r '.id' 2>/dev/null || echo "") |
| |
|
| | if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then |
| | ERROR_MSG=$(echo "$AUTH_RESPONSE" | jq -r '.message // .error // "Unknown error"' 2>/dev/null || echo "") |
| | echo "β Failed to create ECR auth" |
| | echo "Response: $AUTH_RESPONSE" |
| | echo "Error: $ERROR_MSG" |
| |
|
| | |
| | AUTH_NAME="ecr-auth-ylff-${TIMESTAMP}" |
| | echo "Retrying with name: $AUTH_NAME" |
| | AUTH_RESPONSE=$(curl -s --request POST \ |
| | --header 'Content-Type: application/json' \ |
| | --header "Authorization: Bearer ${RUNPOD_API_KEY}" \ |
| | --url "https://rest.runpod.io/v1/containerregistryauth" \ |
| | --data "{ |
| | \"name\": \"$AUTH_NAME\", |
| | \"username\": \"AWS\", |
| | \"password\": \"${ECR_CREDENTIALS}\" |
| | }") |
| |
|
| | AUTH_ID=$(echo "$AUTH_RESPONSE" | jq -r '.id' 2>/dev/null || echo "") |
| |
|
| | if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then |
| | echo "β Failed to create ECR auth even with timestamp suffix" |
| | echo "Response: $AUTH_RESPONSE" |
| | exit 1 |
| | fi |
| | fi |
| |
|
| | |
| | echo "Verifying created ECR auth: $AUTH_ID" |
| | sleep 2 |
| | if verify_auth_exists "$AUTH_ID"; then |
| | echo "β
ECR authentication verified: $AUTH_ID" |
| | else |
| | echo "β οΈ ECR auth created but verification failed, waiting longer..." |
| | sleep 5 |
| | if verify_auth_exists "$AUTH_ID"; then |
| | echo "β
ECR authentication verified after wait: $AUTH_ID" |
| | else |
| | echo "β ECR auth verification failed after retry" |
| | echo "This may cause template creation to fail" |
| | fi |
| | fi |
| |
|
| | |
| | FINAL_TEMPLATES_CHECK=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data '{"query":"query { myself { podTemplates { id name } } }"}') |
| |
|
| | NAME_EXISTS=$(echo "$FINAL_TEMPLATES_CHECK" | jq -r ".data.myself.podTemplates[] | select(.name == \"$TEMPLATE_NAME\") | .id" 2>/dev/null || echo "") |
| |
|
| | if [ -n "$NAME_EXISTS" ] && [ "$NAME_EXISTS" != "null" ]; then |
| | echo "β οΈ Template name '$TEMPLATE_NAME' still exists, using timestamp suffix" |
| | TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}" |
| | echo "Using new template name: $TEMPLATE_NAME" |
| | fi |
| |
|
| | |
| | if [ -z "$AUTH_ID" ] || [ "$AUTH_ID" = "null" ]; then |
| | echo "β Cannot create template: ECR auth ID is missing" |
| | exit 1 |
| | fi |
| |
|
| | |
| | if ! verify_auth_exists "$AUTH_ID"; then |
| | echo "β Cannot create template: ECR auth ID $AUTH_ID does not exist" |
| | echo "This may indicate a timing issue. Please retry the deployment." |
| | exit 1 |
| | fi |
| |
|
| | |
| | echo "Creating template: $TEMPLATE_NAME" |
| | echo "Using ECR auth ID: $AUTH_ID" |
| | echo "Using image: ${FULL_IMAGE}" |
| |
|
| | CREATE_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data "{\"query\":\"mutation { saveTemplate(input: { containerDiskInGb: 10, dockerArgs: \\\"python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000\\\", env: [ { key: \\\"PYTHONUNBUFFERED\\\", value: \\\"1\\\" }, { key: \\\"PYTHONPATH\\\", value: \\\"/app\\\" }, { key: \\\"XDG_CACHE_HOME\\\", value: \\\"/workspace/.cache\\\" }, { key: \\\"HF_HOME\\\", value: \\\"/workspace/.cache/huggingface\\\" }, { key: \\\"HUGGINGFACE_HUB_CACHE\\\", value: \\\"/workspace/.cache/huggingface/hub\\\" }, { key: \\\"TRANSFORMERS_CACHE\\\", value: \\\"/workspace/.cache/huggingface/transformers\\\" }, { key: \\\"TORCH_HOME\\\", value: \\\"/workspace/.cache/torch\\\" } ], imageName: \\\"${FULL_IMAGE}\\\", name: \\\"$TEMPLATE_NAME\\\", ports: \\\"22/tcp,8000/http\\\", readme: \\\"## YLFF Template\\\\nTemplate for running YLFF API server on port 8000\\\", volumeInGb: 20, volumeMountPath: \\\"/workspace\\\", containerRegistryAuthId: \\\"$AUTH_ID\\\" }) { id } }\"}") |
| |
|
| | TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "") |
| | ERROR_MSG=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].message' 2>/dev/null || echo "") |
| | ERROR_PATH=$(echo "$CREATE_RESPONSE" | jq -r '.errors[0].path[0]' 2>/dev/null || echo "") |
| |
|
| | if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then |
| | echo "β Failed to create template" |
| | echo "Response: $CREATE_RESPONSE" |
| |
|
| | if [ -n "$ERROR_MSG" ]; then |
| | echo "Error message: $ERROR_MSG" |
| | echo "Error path: $ERROR_PATH" |
| |
|
| | |
| | if echo "$ERROR_MSG" | grep -qi "Registry Auth not found\|containerRegistryAuthId"; then |
| | echo "β ECR auth ID $AUTH_ID not found in RunPod" |
| | echo "Attempting to verify auth existence..." |
| | if verify_auth_exists "$AUTH_ID"; then |
| | echo "β οΈ Auth exists but template creation failed. This may be a RunPod API issue." |
| | echo "Retrying template creation after delay..." |
| | sleep 5 |
| |
|
| | |
| | CREATE_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data "{\"query\":\"mutation { saveTemplate(input: { containerDiskInGb: 10, dockerArgs: \\\"python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000\\\", env: [ { key: \\\"PYTHONUNBUFFERED\\\", value: \\\"1\\\" }, { key: \\\"PYTHONPATH\\\", value: \\\"/app\\\" } ], imageName: \\\"${FULL_IMAGE}\\\", name: \\\"$TEMPLATE_NAME\\\", ports: \\\"22/tcp,8000/http\\\", readme: \\\"## YLFF Template\\\\nTemplate for running YLFF API server on port 8000\\\", volumeInGb: 20, volumeMountPath: \\\"/workspace\\\", containerRegistryAuthId: \\\"$AUTH_ID\\\" }) { id } }\"}") |
| |
|
| | TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "") |
| | if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then |
| | echo "β Retry also failed" |
| | exit 1 |
| | fi |
| | else |
| | echo "β Auth does not exist. Cannot create template." |
| | exit 1 |
| | fi |
| | elif echo "$ERROR_MSG" | grep -qi "unique\|already exists"; then |
| | echo "β οΈ Template name already exists, trying with timestamp suffix" |
| | TEMPLATE_NAME="${TEMPLATE_NAME}-${TIMESTAMP}" |
| |
|
| | |
| | CREATE_RESPONSE=$(curl -s --request POST \ |
| | --header 'content-type: application/json' \ |
| | --url "https://api.runpod.io/graphql?api_key=${RUNPOD_API_KEY}" \ |
| | --data "{\"query\":\"mutation { saveTemplate(input: { containerDiskInGb: 10, dockerArgs: \\\"python -m uvicorn ylff.app:api_app --host 0.0.0.0 --port 8000\\\", env: [ { key: \\\"PYTHONUNBUFFERED\\\", value: \\\"1\\\" }, { key: \\\"PYTHONPATH\\\", value: \\\"/app\\\" } ], imageName: \\\"${FULL_IMAGE}\\\", name: \\\"$TEMPLATE_NAME\\\", ports: \\\"22/tcp,8000/http\\\", readme: \\\"## YLFF Template\\\\nTemplate for running YLFF API server on port 8000\\\", volumeInGb: 20, volumeMountPath: \\\"/workspace\\\", containerRegistryAuthId: \\\"$AUTH_ID\\\" }) { id } }\"}") |
| |
|
| | TEMPLATE_ID=$(echo "$CREATE_RESPONSE" | jq -r '.data.saveTemplate.id' 2>/dev/null || echo "") |
| | if [ -z "$TEMPLATE_ID" ] || [ "$TEMPLATE_ID" = "null" ]; then |
| | echo "β Failed to create template even with timestamp suffix" |
| | echo "Response: $CREATE_RESPONSE" |
| | exit 1 |
| | fi |
| | else |
| | exit 1 |
| | fi |
| | else |
| | exit 1 |
| | fi |
| | fi |
| |
|
| | echo "template_id=$TEMPLATE_ID" >> $GITHUB_OUTPUT |
| | echo "template_name=$TEMPLATE_NAME" >> $GITHUB_OUTPUT |
| | echo "β
Template created/updated: $TEMPLATE_ID (name: $TEMPLATE_NAME)" |
| |
|
| | - name: Deploy to RunPod - Create or Update Pod |
| | id: deploy-pod |
| | env: |
| | RUNPOD_API_KEY: ${{ secrets.RUNPOD_API_KEY }} |
| | FULL_IMAGE: ${{ steps.image-tag.outputs.full_image }} |
| | STABLE_POD_NAME: "ylff-dev-stable" |
| | run: | |
| | set -euo pipefail |
| | # Check if pod already exists |
| | EXISTING_POD_ID="" |
| | ALL_PODS_OUTPUT=$(runpodctl get pod --allfields 2>/dev/null || echo "") |
| | |
| | if echo "$ALL_PODS_OUTPUT" | grep -q "$STABLE_POD_NAME"; then |
| | EXISTING_POD_ID=$(echo "$ALL_PODS_OUTPUT" | grep "$STABLE_POD_NAME" | awk '{print $1}') |
| | echo "Found existing pod: $EXISTING_POD_ID" |
| |
|
| | |
| | echo "Stopping existing pod for update..." |
| | runpodctl stop pod "$EXISTING_POD_ID" || true |
| | sleep 10 |
| |
|
| | echo "Removing old pod to deploy new version..." |
| | runpodctl remove pod "$EXISTING_POD_ID" || true |
| | sleep 15 |
| | else |
| | echo "No existing pod found, will create new one" |
| | fi |
| |
|
| | sleep 10 |
| |
|
| | |
| | echo "Creating pod: $STABLE_POD_NAME" |
| | echo "Using image: $FULL_IMAGE" |
| | echo "Using template: ${{ steps.create-template.outputs.template_id }}" |
| |
|
| | runpodctl create pod \ |
| | --name="$STABLE_POD_NAME" \ |
| | --imageName="$FULL_IMAGE" \ |
| | --templateId="${{ steps.create-template.outputs.template_id }}" \ |
| | --gpuType="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_type || 'NVIDIA RTX A6000' }}" \ |
| | --gpuCount="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_count || '1' }}" \ |
| | --secureCloud \ |
| | --containerDiskSize=20 \ |
| | --mem=32 \ |
| | --vcpu=4 |
| |
|
| | if [ $? -ne 0 ]; then |
| | echo "Failed to create pod, retrying once..." |
| | sleep 10 |
| | runpodctl create pod \ |
| | --name="$STABLE_POD_NAME" \ |
| | --imageName="$FULL_IMAGE" \ |
| | --templateId="${{ steps.create-template.outputs.template_id }}" \ |
| | --gpuType="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_type || 'NVIDIA RTX A6000' }}" \ |
| | --gpuCount="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.gpu_count || '1' }}" \ |
| | --secureCloud \ |
| | --containerDiskSize=20 \ |
| | --mem=32 \ |
| | --vcpu=4 |
| |
|
| | if [ $? -ne 0 ]; then |
| | exit 1 |
| | fi |
| | fi |
| |
|
| | |
| | echo "Waiting for pod to initialize..." |
| | sleep 30 |
| |
|
| | |
| | ALL_PODS_OUTPUT=$(runpodctl get pod --allfields 2>/dev/null || echo "") |
| | if echo "$ALL_PODS_OUTPUT" | grep -q "$STABLE_POD_NAME"; then |
| | POD_LINE=$(echo "$ALL_PODS_OUTPUT" | grep "$STABLE_POD_NAME") |
| | POD_ID=$(echo "$POD_LINE" | awk '{print $1}') |
| | POD_STATUS=$(echo "$POD_LINE" | awk '{print $7}') |
| | POD_URL="https://${POD_ID}-8000.proxy.runpod.net" |
| |
|
| | echo "β
Pod created successfully!" |
| | echo " Pod Name: $STABLE_POD_NAME" |
| | echo " Pod ID: $POD_ID" |
| | echo " Status: $POD_STATUS" |
| | echo " Backend URL: $POD_URL" |
| |
|
| | |
| | echo "pod_id=${POD_ID}" >> $GITHUB_OUTPUT |
| | echo "pod_url=${POD_URL}" >> $GITHUB_OUTPUT |
| | echo "pod_status=${POD_STATUS}" >> $GITHUB_OUTPUT |
| | else |
| | echo "β οΈ Pod created but details not available yet" |
| | fi |
| |
|
| | - name: Wait for deployed API health |
| | if: always() |
| | env: |
| | POD_URL: ${{ steps.deploy-pod.outputs.pod_url }} |
| | run: | |
| | set -e |
| | if [ -z "${POD_URL:-}" ]; then |
| | echo "No pod_url available; skipping health check." |
| | exit 0 |
| | fi |
| | URL="${POD_URL%/}/health" |
| | echo "Polling ${URL} ..." |
| | deadline=$(( $(date +%s) + 20*60 )) |
| | last="" |
| | while [ "$(date +%s)" -lt "$deadline" ]; do |
| | # -sS: quiet but show errors, -m: max time, -o /dev/null: no body, -w: print status |
| | code="$(curl -sS -m 10 -o /dev/null -w "%{http_code}" "${URL}" || true)" |
| | last="$code" |
| | if [ "$code" = "200" ]; then |
| | echo "Deployed API is healthy." |
| | exit 0 |
| | fi |
| | sleep 10 |
| | done |
| | echo "Timed out waiting for deployed /health: last_status=${last}" |
| | exit 1 |
| | |
| | - name: Add deployment summary |
| | if: always() |
| | run: | |
| | POD_ID="${{ steps.deploy-pod.outputs.pod_id }}" |
| | POD_URL="${{ steps.deploy-pod.outputs.pod_url }}" |
| | POD_STATUS="${{ steps.deploy-pod.outputs.pod_status }}" |
| | TEMPLATE_NAME="${{ steps.create-template.outputs.template_name }}" |
| | FULL_IMAGE="${{ steps.image-tag.outputs.full_image }}" |
| | |
| | { |
| | echo "## π YLFF Deployment Summary" |
| | echo "" |
| | echo "### Pod Information" |
| | if [ -n "$POD_ID" ]; then |
| | echo "- **Pod Name:** ylff-dev-stable" |
| | echo "- **Pod ID:** \`$POD_ID\`" |
| | echo "- **Status:** $POD_STATUS" |
| | echo "" |
| | echo "### π Connection URLs" |
| | echo "- **API Server:** [$POD_URL]($POD_URL)" |
| | echo "- **API Docs:** [$POD_URL/docs]($POD_URL/docs)" |
| | echo "- **Health Check:** [$POD_URL/health]($POD_URL/health)" |
| | echo "" |
| | else |
| | echo "β οΈ Pod details not available" |
| | echo "" |
| | fi |
| | echo "### π¦ Deployment Details" |
| | echo "- **Docker Image:** \`$FULL_IMAGE\`" |
| | echo "- **Template:** $TEMPLATE_NAME" |
| | echo "- **Template ID:** \`${{ steps.create-template.outputs.template_id }}\`" |
| | echo "" |
| | echo "### π API Endpoints" |
| | echo "- \`GET /\` - API information" |
| | echo "- \`GET /health\` - Health check" |
| | echo "- \`GET /models\` - List available models" |
| | echo "- \`POST /api/v1/validate/sequence\` - Validate sequence" |
| | echo "- \`POST /api/v1/validate/arkit\` - Validate ARKit data" |
| | echo "- \`POST /api/v1/dataset/build\` - Build training dataset" |
| | echo "- \`POST /api/v1/train/start\` - Fine-tune model" |
| | echo "- \`POST /api/v1/train/pretrain\` - Pre-train on ARKit" |
| | echo "- \`POST /api/v1/eval/ba-agreement\` - Evaluate BA agreement" |
| | echo "- \`POST /api/v1/visualize\` - Visualize results" |
| | echo "- \`GET /api/v1/jobs\` - List all jobs" |
| | echo "- \`GET /api/v1/jobs/{job_id}\` - Get job status" |
| | } >> $GITHUB_STEP_SUMMARY |
| |
|