---
# Display name of this workflow.
name: Training Bootc image builds

# Triggers: pushes to main that touch the training tree or this workflow
# file, plus manual runs via workflow_dispatch.
on:
  push:
    branches:
      - main
    paths:
      - 'training/**'
      - '.github/workflows/training_bootc.yaml'

  workflow_dispatch:

# All runs share one concurrency group named after the workflow; a run that
# is already in progress is not cancelled by a newer one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

# Workflow-wide settings: target registry host and organisation for the
# built images, and the AWS region used when configuring AWS credentials.
env:
  REGISTRY: quay.io
  REGISTRY_ORG: ai-lab
  REGION: us-east-1

jobs:
  # Starts the EC2 instance that acts as the self-hosted runner and exposes
  # its runner label and instance id to the jobs that depend on it.
  start-runner:
    name: Start self-hosted EC2 runner
    # NOTE(review): this workflow only declares push/workflow_dispatch
    # triggers, so there is presumably never a pull_request payload and this
    # guard always passes — confirm whether it is still needed.
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0154957ba4ce98784
          ec2-instance-type: m7i.12xlarge
          subnet-id: subnet-0b1e1d94240813658
          security-group-id: sg-055105753f5e8bd83

  # Builds the driver-toolkit image on the EC2 runner, re-tags it as
  # nvidia-builder, and pushes both tags on pushes to main.
  nvidia-bootc-builder-image:
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    strategy:
      matrix:
        include:
          - image_name: nvidia-builder
            context: training/nvidia-bootc
            arch: amd64
    runs-on: ${{ needs.start-runner.outputs.label }}
    needs: start-runner
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      # Secrets are handed to the shell through the step environment and the
      # password is piped on stdin, so neither value is spliced into the
      # script text or passed as a command-line argument.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      # "<<<y" answers the overwrite prompt if a key already exists at this
      # path (same form as the nvidia-bootc-image job); it is ignored when
      # no key is present.
      - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
        run: |
          ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" <<<y

      - name: Build Image
        id: build_image
        run: make driver-toolkit ARCH=${{ matrix.arch }}
        working-directory: ${{ matrix.context }}

      - name: tag image as nvidia-builder
        run: >-
          podman tag
          ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/driver-toolkit:latest
          ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/${{ matrix.image_name }}:latest

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
          image: driver-toolkit
          tags: latest

      - name: push the nvidia-builder image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ matrix.image_name }}
          tags: latest
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}

      # Runs regardless of earlier step results so failures are reported too.
      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Builds the nvidia-bootc image after the builder image job has finished:
  # generates the vllm, deepspeed and instruct-nvidia assets, then runs the
  # bootc make target with the pinned driver version.
  nvidia-bootc-image:
    strategy:
      matrix:
        include:
          - image_name: nvidia-bootc
            driver_version: "550.54.15"
            context: training/nvidia-bootc
            arch: amd64
    runs-on: ${{ needs.start-runner.outputs.label }}
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    needs:
      - nvidia-bootc-builder-image
      - start-runner
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE and overwrite the existing one
        run: |
          ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" <<<y

      # Secrets go through the step environment; the password is read from
      # stdin rather than passed with -p.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate the local OCI assets
        run: |
          cd training
          make -j vllm
          make -j deepspeed
          make -j instruct-nvidia

      - name: Build Image
        id: build_image
        run: make bootc DRIVER_VERSION=${{ matrix.driver_version }} ARCH=${{ matrix.arch }} SSH_PUBKEY=~/.ssh/id_rsa.pub
        working-directory: ${{ matrix.context }}

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ matrix.image_name }}
          tags: latest
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}

      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Builds the Intel and AMD bootc images (one matrix entry each). The job is
  # marked continue-on-error, so a failure here does not fail the workflow.
  bootc-images:
    strategy:
      matrix:
        include:
          - image_name: intel-bootc
            context: training/intel-bootc
            arch: amd64
            gpu: intel
          - image_name: amd-bootc
            context: training/amd-bootc
            arch: amd64
            gpu: amd
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    runs-on: ${{ needs.start-runner.outputs.label }}
    needs:
      - start-runner
      - nvidia-bootc-builder-image
    continue-on-error: true
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: |
          mkdir -p ~/.docker

      # Secrets go through the step environment; the password is read from
      # stdin rather than passed with -p.
      - name: Login to Container Registry
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login -u "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate the local OCI assets
        run: |
          cd training
          make -j vllm
          make -j deepspeed
          make -j instruct-${{ matrix.gpu }}

      - name: Build Image
        id: build_image
        run: make bootc ARCH=${{ matrix.arch }} INSTRUCTLAB_IMAGE=${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/instruct-${{ matrix.gpu }}:latest
        working-directory: ${{ matrix.context }}

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          image: ${{ matrix.image_name }}
          tags: latest
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}

      - name: Publish Job Results to Slack
        id: slack
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  # Stops the EC2 runner once the build jobs are done. always() keeps this
  # job from being skipped when an upstream job fails.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - bootc-images
      - nvidia-bootc-image
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v1
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Same region source as start-runner, so the stop request targets
          # the region the instance was launched in.
          aws-region: ${{ env.REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}