# Periodically lists the online, non-CPU self-hosted runners of NVIDIA/NeMo
# and calls the reusable monitor-single-vm.yml workflow once per runner.
name: Reboots VMs in a controlled way

on:
  schedule:
    # Every 15 minutes, starting at minute 0 of each hour.
    - cron: "0/15 * * * *"
  workflow_dispatch:

jobs:
  # Builds the matrix consumed by `maintenance`: a compact JSON array with one
  # {"vm": ..., "n_gpus": ...} object per online runner whose name does not
  # contain "cpu".
  pre-flight:
    runs-on: ubuntu-latest
    if: github.repository_owner == 'NVIDIA'
    outputs:
      list-of-vms: ${{ steps.main.outputs.main }}
    environment: main
    steps:
      - name: Get list of VMs
        id: main
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
        run: |
          # Abort on any failing command, unset variable or broken pipeline so
          # that a bad API response never ends up in the job output.
          set -euo pipefail

          # --fail turns HTTP errors (e.g. an expired token) into a non-zero
          # exit status instead of feeding the error body to jq.
          # per_page=100 is the API maximum (the default is 30).
          # NOTE(review): with more than 100 registered runners this would
          # additionally need pagination.
          RUNNERS=$(curl -L --fail --silent --show-error \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer $GITHUB_TOKEN" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/NVIDIA/NeMo/actions/runners?per_page=100")

          # n_gpus is the first character of the first label ending in "gpu";
          # it is null when the runner has no such label.
          # NOTE(review): presumably labels look like "2gpu" - confirm.
          MATRIX=$(jq -c '[
              .runners[]
              | select(.status == "online")
              | select(.name | contains("cpu") | not)
              | {
                  "vm": .name,
                  "n_gpus": [
                    .labels[]
                    | select(.name | endswith("gpu")) | .name
                  ][0][:1]
                }
            ]' <<< "$RUNNERS")

          # Quoted so the JSON is written verbatim (no word splitting or
          # glob expansion of the brackets).
          echo "main=$MATRIX" | tee -a "$GITHUB_OUTPUT"

  # Fans out over the runners found by `pre-flight`.
  maintenance:
    needs: pre-flight
    # An empty `include` list makes the matrix invalid and fails the run, so
    # skip this job when no runner matched.
    if: needs.pre-flight.outputs.list-of-vms != '[]'
    strategy:
      fail-fast: false
      matrix:
        include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms) }}
    uses: ./.github/workflows/monitor-single-vm.yml
    with:
      vm: ${{ matrix.vm }}
      n_gpus: ${{ matrix.n_gpus }}
    secrets: inherit  # pragma: allowlist secret