# Spaces:
# No application file
# No application file
---
# Builds and publishes the training bootc container images.
#
# Job order (as wired through `needs`):
#   start-runner
#     -> nvidia-bootc-builder-image
#          -> nvidia-bootc-image
#          -> bootc-images (intel, amd)
#               -> stop-runner (always runs)
#
# The build jobs run on the EC2 runner whose label is emitted by
# `start-runner`; `stop-runner` passes that label and instance id back to the
# same action in `stop` mode.
name: Training Bootc image builds

on:
  push:
    branches: [main]
    paths:
      - 'training/**'
      - '.github/workflows/training_bootc.yaml'
  workflow_dispatch:

# All runs of this workflow share one concurrency group; a run that is already
# in progress is not cancelled by a newer one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

env:
  REGISTRY: quay.io
  REGISTRY_ORG: ai-lab
  # AWS region used both to start and to stop the EC2 runner.
  REGION: us-east-1

jobs:
  # NOTE(review): the `hold-tests` label guards below read
  # `github.event.pull_request`, but no `pull_request` trigger is declared
  # under `on:`. With only `push` and `workflow_dispatch` the guards presumably
  # always evaluate to true - confirm whether a PR trigger is intended.

  start-runner:
    name: Start self-hosted EC2 runner
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    runs-on: ubuntu-latest
    # Consumed by the build jobs (`runs-on`) and by `stop-runner`.
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.REGION }}

      - name: Start EC2 runner
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ami-0154957ba4ce98784
          ec2-instance-type: m7i.12xlarge
          subnet-id: subnet-0b1e1d94240813658
          security-group-id: sg-055105753f5e8bd83

  nvidia-bootc-builder-image:
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    needs: start-runner
    runs-on: ${{ needs.start-runner.outputs.label }}
    strategy:
      matrix:
        include:
          - image_name: nvidia-builder
            context: training/nvidia-bootc
            arch: amd64
    permissions:
      contents: read
      packages: write
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: mkdir -p ~/.docker

      - name: Login to Container Registry
        # Credentials are handed to the shell as environment variables and the
        # password is read from stdin, so no secret is spliced into the script
        # text and the password is not passed on the command line.
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login --username "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE
        # `<<< y` answers the overwrite prompt if a key already exists on the
        # runner, matching the nvidia-bootc-image job.
        run: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" <<< y

      - name: Build Image
        id: build_image
        run: make driver-toolkit ARCH=${{ matrix.arch }}
        working-directory: ${{ matrix.context }}

      - name: tag image as nvidia-builder
        # The same image is published under two names: driver-toolkit and the
        # matrix image name.
        run: >-
          podman tag
          ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/driver-toolkit:latest
          ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/${{ matrix.image_name }}:latest

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
          image: driver-toolkit
          tags: latest

      - name: push the nvidia-builder image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
          image: ${{ matrix.image_name }}
          tags: latest

      - name: Publish Job Results to Slack
        id: slack
        # Report the job status whether the job succeeded or failed.
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  nvidia-bootc-image:
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    needs: [nvidia-bootc-builder-image, start-runner]
    runs-on: ${{ needs.start-runner.outputs.label }}
    strategy:
      matrix:
        include:
          - image_name: nvidia-bootc
            driver_version: "550.54.15"
            context: training/nvidia-bootc
            arch: amd64
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: mkdir -p ~/.docker

      - name: generate a ssh key - USER SHOULD INJECT THEIR OWN AND REBUILD IF THEY USE THIS IMAGE and overwrite the existing one
        # `<<< y` answers ssh-keygen's overwrite prompt.
        run: ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N "" <<< y

      - name: Login to Container Registry
        # Credentials are handed to the shell as environment variables and the
        # password is read from stdin, so no secret is spliced into the script
        # text and the password is not passed on the command line.
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login --username "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate the local OCI assets
        run: |
          cd training
          make -j vllm
          make -j deepspeed
          make -j instruct-nvidia

      - name: Build Image
        id: build_image
        # The public half of the key generated above is passed to the build.
        run: >-
          make bootc
          DRIVER_VERSION=${{ matrix.driver_version }}
          ARCH=${{ matrix.arch }}
          SSH_PUBKEY=~/.ssh/id_rsa.pub
        working-directory: ${{ matrix.context }}

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
          image: ${{ matrix.image_name }}
          tags: latest

      - name: Publish Job Results to Slack
        id: slack
        # Report the job status whether the job succeeded or failed.
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  bootc-images:
    if: "!contains(github.event.pull_request.labels.*.name, 'hold-tests')"
    needs: [start-runner, nvidia-bootc-builder-image]
    runs-on: ${{ needs.start-runner.outputs.label }}
    # A failure in this job does not fail the workflow run.
    continue-on-error: true
    strategy:
      matrix:
        include:
          - image_name: intel-bootc
            context: training/intel-bootc
            arch: amd64
            gpu: intel
          - image_name: amd-bootc
            context: training/amd-bootc
            arch: amd64
            gpu: amd
    steps:
      - uses: actions/checkout@v4.1.7

      - name: mkdir root/.docker directory
        run: mkdir -p ~/.docker

      - name: Login to Container Registry
        # Credentials are handed to the shell as environment variables and the
        # password is read from stdin, so no secret is spliced into the script
        # text and the password is not passed on the command line.
        env:
          REGISTRY_USER: ${{ secrets.REGISTRY_USER }}
          REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
        run: |
          printf '%s' "${REGISTRY_PASSWORD}" |
            podman login --username "${REGISTRY_USER}" --password-stdin "${REGISTRY}"

      - name: generate the local OCI assets
        run: |
          cd training
          make -j vllm
          make -j deepspeed
          make -j instruct-${{ matrix.gpu }}

      - name: Build Image
        id: build_image
        run: >-
          make bootc
          ARCH=${{ matrix.arch }}
          INSTRUCTLAB_IMAGE=${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}/instruct-${{ matrix.gpu }}:latest
        working-directory: ${{ matrix.context }}

      - name: Push image
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
        uses: redhat-actions/push-to-registry@v2.8
        with:
          registry: ${{ env.REGISTRY }}/${{ env.REGISTRY_ORG }}
          image: ${{ matrix.image_name }}
          tags: latest

      - name: Publish Job Results to Slack
        id: slack
        # Report the job status whether the job succeeded or failed.
        if: always()
        uses: slackapi/slack-github-action@v1.26.0
        with:
          payload: |
            {
              "text": "${{ github.workflow }} workflow status: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - bootc-images
      - nvidia-bootc-image
    runs-on: ubuntu-latest
    # Required so the runner is stopped even when an earlier job failed.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          # Same region source as start-runner, so the stop call targets the
          # region the instance was launched in.
          aws-region: ${{ env.REGION }}

      - name: Stop EC2 runner
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}