# Copyright (c) 2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CICD NeMo

# Triggers: nightly schedule, PRs that receive a label (only the `Run CICD`
# label actually selects tests, see `pre-flight`), and manual dispatch.
on:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    branches:
      - main
      - r**
      - weekly-bump*
    types: [labeled]
  workflow_dispatch:
    inputs:
      test_to_run:
        required: false
        default: all
        type: string
        description: Comma-separated list of tests to run. Use "all" to run the full test suite.

# One active run per (workflow, PR-or-ref, label, event); newer runs cancel older ones.
concurrency:
  # group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}-${{ github.event_name }}
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

jobs:
  # Computes the switches every downstream job gates on:
  #   test_to_run       - JSON array of test names ('[]' means "run nothing")
  #   components_to_run - JSON array of components selected for this run
  #   is_ci_workload    - 'true' for scheduled runs and `bump-ci-container*` branches
  #   no_fail_fast      - 'true' for scheduled runs or PRs labeled `no-fail-fast`
  pre-flight:
    runs-on: ubuntu-latest
    outputs:
      test_to_run: ${{ steps.test_to_run.outputs.main }}
      is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
      no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
      components_to_run: ${{ steps.components_to_run.outputs.main }}
    env:
      TESTS_TO_RUN: ${{ inputs.test_to_run }}
      EVENT_NAME: ${{ github.event_name }}
      HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
    steps:
      - name: Checkout branch
        uses: actions/checkout@v4
        with:
          # Full history so the component selection script can compare two SHAs.
          fetch-depth: 0

      - name: Select components to run
        id: components_to_run
        env:
          # Passed through the environment instead of being interpolated into
          # the script text, so the shell never parses expression output as code.
          SOURCE_SHA: ${{ github.event.pull_request.head.sha }}
          TARGET_SHA: ${{ github.event.pull_request.base.sha }}
        run: |
          pip install -U pip
          pip install git-python

          # PRs: the script is expected to write test_modules.json (read below).
          # All other events: run every component.
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            python .github/scripts/components_to_run.py --source-sha "$SOURCE_SHA" --target-sha "$TARGET_SHA"
          else
            echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
          fi

          components_to_run=$(cat test_modules.json)

          echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"

      - name: Select tests to run
        id: test_to_run
        run: |
          # For manual dispatch, we replace `all` with the actual job names
          if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
            TESTS_TO_RUN=$TESTS_TO_RUN

          # For correctly labeled PR, we replace `all` with the actual job names
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
            TESTS_TO_RUN=all

          # For incorrectly labeled PR, run no tests
          elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
            TESTS_TO_RUN=""

          # For push events, run all tests. This is so that we can generate coverage
          # on branch `main`.
          elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
            TESTS_TO_RUN=all

          else
            echo "Unsupported event_name $EVENT_NAME provided".
            exit 1
          fi

          # Comma-separated string -> compact JSON array; an empty string yields `[]`.
          parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
          echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"

      - name: Check if this is a CI workload
        shell: bash
        id: is_ci_workload
        run: |
          branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
            is_ci_workload=true
          else
            is_ci_workload=false
          fi

          # Single write of the output (previously `main=true` was emitted twice).
          echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"

      - name: Check if no-fail-fast is set
        shell: bash
        id: no_fail_fast
        env:
          HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
        run: |
          if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
            no_fail_fast=true
          else
            no_fail_fast=false
          fi

          echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"

  code-linting:
    if: needs.pre-flight.outputs.test_to_run != '[]'
    needs: [pre-flight]
    uses: ./.github/workflows/code-linting.yml

  # Gate bound to the `test` environment; skipped for CI workloads.
  cicd-wait-in-queue:
    needs: [pre-flight, code-linting]
    runs-on: ubuntu-latest
    environment: test
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && needs.pre-flight.outputs.is_ci_workload == 'false'
    steps:
      - name: Running CI tests
        run: |
          echo "Running CI tests"

  cicd-test-container-build:
    uses: ./.github/workflows/_build_container.yml
    needs: [pre-flight, code-linting, cicd-wait-in-queue]
    # Runs when everything upstream succeeded, or when the queue gate was
    # skipped because this is a CI workload.
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      image-name: nemo_container
      dockerfile: docker/Dockerfile.ci

  cicd-import-tests:
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    # `cicd-wait-in-queue` must be a direct dependency: the `needs` context only
    # exposes jobs listed here, so without it the `.result == 'skipped'` test
    # above can never be true. It is already upstream via the container build,
    # so execution order is unchanged.
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build]
    runs-on: self-hosted-azure-gpus-1
    steps:
      - name: Create UUID
        id: uuid
        run: |
          echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"

      - name: Checkout NeMo
        uses: actions/checkout@v4
        with:
          repository: NVIDIA/NeMo
          # Unique per-run/per-job directory on the self-hosted runner.
          path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo

      # NOTE(review): the inner script has no `set -e`, so only the exit status
      # of the last command decides whether this step fails - confirm intent.
      - name: Run some checks
        run: |
          docker run \
            --rm \
            --device=/dev/nvidia0 \
            --gpus all \
            --shm-size=8g \
            --volume "$(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace" \
            --env TRANSFORMERS_OFFLINE=0 \
            --env HYDRA_FULL_ERROR=1 \
            --env PYTHONUNBUFFERED=1 \
            nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
            bash -c '\
              # PyTorch Lightning version
              python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"

              # PyTorch Lightning DDP Checks
              CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"

              # Basic Import Checks
              python tests/core_ptl/check_imports.py --domain asr
              python tests/core_ptl/check_imports.py --domain nlp
              python tests/core_ptl/check_imports.py --domain tts
            '

# Test, aggregation and coverage jobs; every entry below belongs to the workflow-level `jobs` mapping.
  L0_Setup_Test_Data_And_Models:
    needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
    runs-on: self-hosted-azure
    # Runs when everything upstream succeeded, or when the queue gate was
    # skipped because this is a CI workload.
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}

      - name: main
        uses: NVIDIA/NeMo/.github/actions/test-template@main
        with:
          runner: ${{ runner.name }}
          script: L0_Setup_Test_Data_And_Models
          tests_to_run: '["L0_Setup_Test_Data_And_Models"]'

  # In the five reusable-workflow jobs below, `cicd-wait-in-queue` is listed in
  # `needs` because the `needs` context only exposes direct dependencies;
  # without it `needs.cicd-wait-in-queue.result == 'skipped'` can never be true.
  # The job is already upstream of the container build, so ordering is unchanged.
  cicd-main-unit-tests:
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build]
    uses: ./.github/workflows/cicd-main-unit-tests.yml
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  cicd-main-export-deploy:
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build, cicd-main-unit-tests]
    uses: ./.github/workflows/cicd-main-export-deploy.yml
    if: |
      (
        needs.pre-flight.outputs.test_to_run != '[]'
        && (
          contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
        )
      )
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  cicd-main-speech:
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build, cicd-main-unit-tests]
    uses: ./.github/workflows/cicd-main-speech.yml
    if: |
      (
        needs.pre-flight.outputs.test_to_run != '[]'
        && (
          contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
        )
      )
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  cicd-main-automodel:
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build, cicd-main-unit-tests]
    uses: ./.github/workflows/cicd-main-automodel.yml
    if: |
      (
        needs.pre-flight.outputs.test_to_run != '[]'
        && (
          contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
        )
      )
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # NOTE(review): only this job also accepts components_to_run == '["all"]';
  # the sibling component jobs do not - confirm whether that is intentional.
  cicd-main-nemo2:
    needs: [pre-flight, cicd-wait-in-queue, cicd-test-container-build, cicd-main-unit-tests]
    uses: ./.github/workflows/cicd-main-nemo2.yml
    if: |
      (
        needs.pre-flight.outputs.test_to_run != '[]'
        && (
          contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
          || needs.pre-flight.outputs.components_to_run == '["all"]'
        )
      )
      && (
        success()
        || (
          needs.cicd-wait-in-queue.result == 'skipped'
          && needs.pre-flight.outputs.is_ci_workload == 'true'
        )
      )
      && !cancelled()
    with:
      test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}

  # Aggregation job: always runs, derives one overall result from the job
  # conclusions of this run, then removes the trigger label, comments on the PR
  # or sends an alert, and finally exits non-zero unless the result is `success`.
  Nemo_CICD_Test:
    needs:
      - pre-flight
      - cicd-test-container-build
      - cicd-import-tests
      - L0_Setup_Test_Data_And_Models
      - cicd-main-unit-tests
      - cicd-main-nemo2
      - cicd-main-export-deploy
      - cicd-main-automodel
      - cicd-main-speech
    if: always()
    runs-on: ubuntu-latest
    permissions: write-all
    steps:
      # A single checkout serves every later step; the duplicate
      # "Checkout for GH CLI" step was removed because nothing in between
      # modifies the workspace.
      - name: Checkout
        uses: actions/checkout@v4

      - name: Get workflow result
        id: result
        env:
          GH_TOKEN: ${{ github.token }}
          RUN_ID: ${{ github.run_id }}
          HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
          IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
        run: |
          # Get workflow run details and check job conclusions
          NUM_FAILED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
          NUM_CANCELLED=$(gh run view "$RUN_ID" --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')

          # NOTE(review): success requires the `Run CICD` label or a scheduled
          # run, so a green workflow_dispatch run is reported as "failure" -
          # confirm this is intended.
          if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
            RESULT="success"
          elif [[ $NUM_CANCELLED -gt 0 ]]; then
            RESULT="cancelled"
          else
            RESULT="failure"
          fi

          # Output the final status
          echo "code=$RESULT" | tee -a "$GITHUB_OUTPUT"

      # Only for PRs from the same repository (not forks).
      - name: Remove label if not cancelled
        if: |
          steps.result.outputs.code != 'cancelled'
          && github.event.label.name == 'Run CICD'
          && github.event.pull_request.head.repo.full_name == github.repository
        env:
          GH_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.number }}
        run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"

      - name: Pipeline successful, add PR comment
        if: |
          steps.result.outputs.code == 'success'
          && github.event_name == 'pull_request'
          && env.SLACK_WEBHOOK != ''
        uses: peter-evans/create-or-update-comment@v4
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
        with:
          issue-number: ${{ github.event.number }}
          body: |
            [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,

            We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.

            So it might be time to merge this PR or get some approvals.

            //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc

      - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
        if: |
          steps.result.outputs.code == 'failure'
          && github.event.label.name == 'Run CICD'
          && env.SLACK_WEBHOOK != ''
        env:
          SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPOSITORY: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          PR_NUMBER: ${{ github.event.number }}
          SERVER_URL: ${{ github.server_url }}
        run: |
          set -x
          pip install PyGithub
          export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}

          python .github/scripts/notify.py

      - name: Exit
        if: ${{ always() }}
        env:
          RESULT: ${{ steps.result.outputs.code }}
        run: |
          # Quoted comparison: an empty RESULT (e.g. the result step itself
          # failed) is handled as "not success" instead of a `test` syntax error.
          if [[ "$RESULT" == "success" ]]; then
            exit 0
          else
            exit 1
          fi

  # Combines the per-job coverage artifacts of each flag, uploads the combined
  # data to Codecov and republishes it as an aggregated artifact.
  Coverage:
    runs-on: ubuntu-latest
    needs: [pre-flight, Nemo_CICD_Test]
    if: |
      needs.pre-flight.outputs.test_to_run != '[]'
      && needs.pre-flight.outputs.components_to_run != '[]'
      && (
        success()
        || needs.Nemo_CICD_Test.result == 'success'
      )
      && !cancelled()
    strategy:
      matrix:
        flag: [unit-test, e2e]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Download coverage reports of current branch
        uses: actions/download-artifact@v4
        with:
          pattern: coverage-${{ matrix.flag }}-*

      - name: Get total coverage of current branch
        shell: bash -x -e -u -o pipefail {0}
        if: always()
        run: |
          pip install coverage

          ls -al .
          ls -al coverage-*/
          coverage combine --keep $(ls coverage-*/.coverage)
          coverage report -i
          rm -rf coverage-*
          ls -al

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          verbose: true
          flags: ${{ matrix.flag }}

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-${{ matrix.flag }}-aggregated
          path: |
            .coverage
          include-hidden-files: true