Respair commited on
Commit
b386992
·
verified ·
1 Parent(s): 6dfbe17

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +91 -0
  2. .github/CODEOWNERS +4 -0
  3. .github/ISSUE_TEMPLATE/bug_report.md +42 -0
  4. .github/ISSUE_TEMPLATE/dev_container_bug_report.md +35 -0
  5. .github/ISSUE_TEMPLATE/feature_request.md +25 -0
  6. .github/PULL_REQUEST_TEMPLATE.md +57 -0
  7. .github/actions/cancel-workflow/action.yml +25 -0
  8. .github/actions/test-template/action.yml +228 -0
  9. .github/labeler.yml +55 -0
  10. .github/scripts/__init__.py +0 -0
  11. .github/scripts/components_to_run.py +84 -0
  12. .github/scripts/nemo_dependencies.py +400 -0
  13. .github/scripts/notify.py +79 -0
  14. .github/workflows/_build_container.yml +89 -0
  15. .github/workflows/_bump_mcore_tag.yml +56 -0
  16. .github/workflows/build-test-publish-wheel.yml +38 -0
  17. .github/workflows/changelog-build.yml +73 -0
  18. .github/workflows/cherry-pick-release-commit.yml +14 -0
  19. .github/workflows/cicd-approve-test-queue.yml +175 -0
  20. .github/workflows/cicd-main-automodel.yml +137 -0
  21. .github/workflows/cicd-main-export-deploy.yml +114 -0
  22. .github/workflows/cicd-main-nemo2.yml +302 -0
  23. .github/workflows/cicd-main-speech.yml +198 -0
  24. .github/workflows/cicd-main-testcopy.yml +472 -0
  25. .github/workflows/cicd-main-unit-tests.yml +212 -0
  26. .github/workflows/cicd-main.yml +496 -0
  27. .github/workflows/cicd-relabel-bot.yml +36 -0
  28. .github/workflows/close-inactive-issue-pr.yml +25 -0
  29. .github/workflows/code-formatting.yml +73 -0
  30. .github/workflows/code-init-file-checker.yml +23 -0
  31. .github/workflows/code-linting.yml +159 -0
  32. .github/workflows/codeql.yml +75 -0
  33. .github/workflows/config/changelog-config.json +134 -0
  34. .github/workflows/config/codeql.yml +9 -0
  35. .github/workflows/copyright-check.yml +22 -0
  36. .github/workflows/gh-docs.yml +81 -0
  37. .github/workflows/install-test.yml +127 -0
  38. .github/workflows/labeler.yml +14 -0
  39. .github/workflows/mcore-tag-bump-bot.yml +62 -0
  40. .github/workflows/monitor-single-vm.yml +54 -0
  41. .github/workflows/monitor-vms.yml +54 -0
  42. .github/workflows/release-freeze.yml +85 -0
  43. .github/workflows/release.yml +48 -0
  44. .github/workflows/secrets-detector.yml +43 -0
  45. .github/workflows/update-buildcache.yml +112 -0
  46. README.md +544 -0
  47. canary_results/canary-small/checkpoints/canary-small.nemo +3 -0
  48. canary_results/canary-small/cmd-args.log +1 -0
  49. canary_results/canary-small/git-info.log +0 -0
  50. canary_results/canary-small/lightning_logs.txt +21 -0
.gitattributes CHANGED
@@ -33,3 +33,94 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ canary_results/canary-small/checkpoints/canary-small.nemo filter=lfs diff=lfs merge=lfs -text
37
+ data/tsukasa_manifest.json filter=lfs diff=lfs merge=lfs -text
38
+ data/tsukasa_manifest.lst filter=lfs diff=lfs merge=lfs -text
39
+ data/tsukasa_train.json filter=lfs diff=lfs merge=lfs -text
40
+ docs/source/asr/images/citrinet_vertical.png filter=lfs diff=lfs merge=lfs -text
41
+ docs/source/asr/images/conf-ensembles-overview.png filter=lfs diff=lfs merge=lfs -text
42
+ docs/source/asr/images/conformer_ctc.png filter=lfs diff=lfs merge=lfs -text
43
+ docs/source/asr/images/hat.png filter=lfs diff=lfs merge=lfs -text
44
+ docs/source/asr/images/hybrid_asr_tts_model.png filter=lfs diff=lfs merge=lfs -text
45
+ docs/source/asr/images/jasper_vertical.png filter=lfs diff=lfs merge=lfs -text
46
+ docs/source/asr/images/quartz_vertical.png filter=lfs diff=lfs merge=lfs -text
47
+ docs/source/asr/images/squeezeformer.png filter=lfs diff=lfs merge=lfs -text
48
+ docs/source/asr/speaker_diarization/images/asr_sd_diagram.png filter=lfs diff=lfs merge=lfs -text
49
+ docs/source/asr/speaker_diarization/images/ats.png filter=lfs diff=lfs merge=lfs -text
50
+ docs/source/asr/speaker_diarization/images/data_flow.png filter=lfs diff=lfs merge=lfs -text
51
+ docs/source/asr/speaker_diarization/images/e2e_and_cascaded_diar_systems.png filter=lfs diff=lfs merge=lfs -text
52
+ docs/source/asr/speaker_diarization/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
53
+ docs/source/asr/speaker_diarization/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
54
+ docs/source/asr/speaker_diarization/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
55
+ docs/source/asr/speaker_diarization/images/ms_trade_off.png filter=lfs diff=lfs merge=lfs -text
56
+ docs/source/asr/speaker_diarization/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
57
+ docs/source/asr/speaker_diarization/images/scale_weight_cnn.png filter=lfs diff=lfs merge=lfs -text
58
+ docs/source/asr/speaker_diarization/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
59
+ docs/source/asr/speaker_diarization/images/weighted_sum.png filter=lfs diff=lfs merge=lfs -text
60
+ docs/source/asr/speaker_recognition/images/ICASPP_SpeakerNet.png filter=lfs diff=lfs merge=lfs -text
61
+ docs/source/asr/speaker_recognition/images/titanet_network.png filter=lfs diff=lfs merge=lfs -text
62
+ docs/source/asr/speech_classification/images/marblenet_vertical.png filter=lfs diff=lfs merge=lfs -text
63
+ docs/source/asr/speech_classification/images/matchboxnet_vertical.png filter=lfs diff=lfs merge=lfs -text
64
+ docs/source/asr/speech_intent_slot/images/example.png filter=lfs diff=lfs merge=lfs -text
65
+ docs/source/core/whyntypes.gif filter=lfs diff=lfs merge=lfs -text
66
+ docs/source/multimodal/mllm/images/llava_arch.jpg filter=lfs diff=lfs merge=lfs -text
67
+ docs/source/multimodal/nerf/images/dreamfusion_model_overview.png filter=lfs diff=lfs merge=lfs -text
68
+ docs/source/multimodal/text2img/images/imagen_arch.png filter=lfs diff=lfs merge=lfs -text
69
+ docs/source/multimodal/vlm/images/clip_arch.png filter=lfs diff=lfs merge=lfs -text
70
+ docs/source/nlp/entity_linking_overview.jpg filter=lfs diff=lfs merge=lfs -text
71
+ docs/source/nlp/nemo_megatron/customization_forward.png filter=lfs diff=lfs merge=lfs -text
72
+ docs/source/nlp/nemo_megatron/customization_module.png filter=lfs diff=lfs merge=lfs -text
73
+ docs/source/nlp/nemo_megatron/images/ddp.gif filter=lfs diff=lfs merge=lfs -text
74
+ docs/source/nlp/nemo_megatron/images/pnom.gif filter=lfs diff=lfs merge=lfs -text
75
+ docs/source/nlp/nemo_megatron/images/pp.gif filter=lfs diff=lfs merge=lfs -text
76
+ docs/source/nlp/nemo_megatron/images/pp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
77
+ docs/source/nlp/nemo_megatron/images/tp1.png filter=lfs diff=lfs merge=lfs -text
78
+ docs/source/nlp/nemo_megatron/images/tp2.png filter=lfs diff=lfs merge=lfs -text
79
+ docs/source/nlp/nemo_megatron/images/tp_comm_overlap.png filter=lfs diff=lfs merge=lfs -text
80
+ docs/source/tools/images/scrsh_2.png filter=lfs diff=lfs merge=lfs -text
81
+ docs/source/tools/images/scrsh_9.png filter=lfs diff=lfs merge=lfs -text
82
+ docs/source/tools/images/sde_mls_player.png filter=lfs diff=lfs merge=lfs -text
83
+ docs/source/tools/images/sde_player.png filter=lfs diff=lfs merge=lfs -text
84
+ docs/source/tools/images/sde_samples.png filter=lfs diff=lfs merge=lfs -text
85
+ docs/source/tts/images/audiocodec_model.png filter=lfs diff=lfs merge=lfs -text
86
+ docs/source/tts/images/data_labeling_pipeline.png filter=lfs diff=lfs merge=lfs -text
87
+ docs/source/tts/images/fastpitch_model.png filter=lfs diff=lfs merge=lfs -text
88
+ docs/source/tts/images/hifigan_d_model.png filter=lfs diff=lfs merge=lfs -text
89
+ docs/source/tts/images/hifigan_g_model.png filter=lfs diff=lfs merge=lfs -text
90
+ docs/source/tts/images/mixertts_model.png filter=lfs diff=lfs merge=lfs -text
91
+ docs/source/tts/images/radaligner_model.png filter=lfs diff=lfs merge=lfs -text
92
+ docs/source/tts/images/radtts_model.png filter=lfs diff=lfs merge=lfs -text
93
+ docs/source/tts/images/tacotron2_model.png filter=lfs diff=lfs merge=lfs -text
94
+ docs/source/tts/images/univnet_model.png filter=lfs diff=lfs merge=lfs -text
95
+ docs/source/tts/images/waveglow_model.png filter=lfs diff=lfs merge=lfs -text
96
+ docs/source/vision/images/vit_arch.png filter=lfs diff=lfs merge=lfs -text
97
+ nemo/collections/diffusion/assets/mixed_training.png filter=lfs diff=lfs merge=lfs -text
98
+ nemo/collections/diffusion/assets/pipeline_conditioning.png filter=lfs diff=lfs merge=lfs -text
99
+ nemo/collections/diffusion/assets/st_dit_hybrid_parallel.png filter=lfs diff=lfs merge=lfs -text
100
+ tools/speech_data_explorer/screenshot.png filter=lfs diff=lfs merge=lfs -text
101
+ tools/speech_data_simulator/pictures/audio_session.png filter=lfs diff=lfs merge=lfs -text
102
+ tutorials/asr/images/multilang_asr_inference.png filter=lfs diff=lfs merge=lfs -text
103
+ tutorials/asr/images/multilang_asr_train.png filter=lfs diff=lfs merge=lfs -text
104
+ tutorials/asr/images/promptformat.png filter=lfs diff=lfs merge=lfs -text
105
+ tutorials/asr/images/test_wer_wandb.png filter=lfs diff=lfs merge=lfs -text
106
+ tutorials/asr/images/tokenizer.png filter=lfs diff=lfs merge=lfs -text
107
+ tutorials/llm/llama/biomedical-qa/img/e2e-lora-train-and-deploy.png filter=lfs diff=lfs merge=lfs -text
108
+ tutorials/llm/llama/domain-adaptive-pretraining/code/imgs/tokenization_diagram.png filter=lfs diff=lfs merge=lfs -text
109
+ tutorials/llm/llama/sdg-law-title-generation/img/e2e-lora-train-and-deploy.png filter=lfs diff=lfs merge=lfs -text
110
+ tutorials/multimodal/images/LITA_arch.png filter=lfs diff=lfs merge=lfs -text
111
+ tutorials/nlp/images/prompt_learning_forward_pass.png filter=lfs diff=lfs merge=lfs -text
112
+ tutorials/nlp/images/spellmapper_inference_pipeline.png filter=lfs diff=lfs merge=lfs -text
113
+ tutorials/speaker_tasks/images/affinity_matrix_fusion.png filter=lfs diff=lfs merge=lfs -text
114
+ tutorials/speaker_tasks/images/ats.png filter=lfs diff=lfs merge=lfs -text
115
+ tutorials/speaker_tasks/images/cascaded_diar_diagram.png filter=lfs diff=lfs merge=lfs -text
116
+ tutorials/speaker_tasks/images/intro_comparison.png filter=lfs diff=lfs merge=lfs -text
117
+ tutorials/speaker_tasks/images/loss_types.png filter=lfs diff=lfs merge=lfs -text
118
+ tutorials/speaker_tasks/images/main_dataflow.png filter=lfs diff=lfs merge=lfs -text
119
+ tutorials/speaker_tasks/images/msdd_inputs.png filter=lfs diff=lfs merge=lfs -text
120
+ tutorials/speaker_tasks/images/msdd_output_loss.png filter=lfs diff=lfs merge=lfs -text
121
+ tutorials/speaker_tasks/images/msdd_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
122
+ tutorials/speaker_tasks/images/multiscale_example.png filter=lfs diff=lfs merge=lfs -text
123
+ tutorials/speaker_tasks/images/sortformer.png filter=lfs diff=lfs merge=lfs -text
124
+ tutorials/tts/audio_samples/new_dict_entry.wav filter=lfs diff=lfs merge=lfs -text
125
+ tutorials/tts/audio_samples/phonemes_as_input.wav filter=lfs diff=lfs merge=lfs -text
126
+ tutorials/tts/images/tacotron2_diagram.png filter=lfs diff=lfs merge=lfs -text
.github/CODEOWNERS ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .github/ @pablo-garay @ko3n1g @thomasdhc @chtruong814
2
+ docker/Dockerfile.ci @pablo-garay @ko3n1g @thomasdhc @chtruong814
3
+ .pylintrc.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
4
+ .flake8.* @pablo-garay @ko3n1g @thomasdhc @chtruong814
.github/ISSUE_TEMPLATE/bug_report.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Bug report
3
+ about: Create a report to help us improve
4
+ title: ''
5
+ labels: bug
6
+ assignees: ''
7
+
8
+ ---
9
+
10
+ **Describe the bug**
11
+
12
+ A clear and concise description of what the bug is.
13
+
14
+ **Steps/Code to reproduce bug**
15
+
16
+ Please list *minimal* steps or code snippet for us to be able to reproduce the bug.
17
+
18
+ A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
19
+
20
+
21
+ **Expected behavior**
22
+
23
+ A clear and concise description of what you expected to happen.
24
+
25
+ **Environment overview (please complete the following information)**
26
+
27
+ - Environment location: [Bare-metal, Docker, Cloud(specify cloud provider - AWS, Azure, GCP, Collab)]
28
+ - Method of NeMo install: [pip install or from source]. Please specify exact commands you used to install.
29
+ - If method of install is [Docker], provide `docker pull` & `docker run` commands used
30
+
31
+ **Environment details**
32
+
33
+ If NVIDIA docker image is used you don't need to specify these.
34
+ Otherwise, please provide:
35
+ - OS version
36
+ - PyTorch version
37
+ - Python version
38
+
39
+ **Additional context**
40
+
41
+ Add any other context about the problem here.
42
+ Example: GPU model
.github/ISSUE_TEMPLATE/dev_container_bug_report.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ container pulled on date: mm/dd/yyyy
3
+ name: Dev container - Bug report
4
+ about: Create a report to help us improve
5
+ title: ''
6
+ labels: bug
7
+ assignees: ''
8
+
9
+ ---
10
+
11
+ **Describe the bug**
12
+
13
+ A clear and concise description of what the bug is.
14
+
15
+ **Steps/Code to reproduce bug**
16
+
17
+ Please list *minimal* steps or code snippet for us to be able to reproduce the bug.
18
+
19
+ A helpful guide on on how to craft a minimal bug report http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.
20
+
21
+
22
+ **Expected behavior**
23
+
24
+ A clear and concise description of what you expected to happen.
25
+
26
+ **Environment overview (please complete the following information)**
27
+
28
+ - Environment location: Docker
29
+ - Method of install: Please specify exact commands you used to install.
30
+ - If method of install is [Docker], provide `docker pull` & `docker run` commands used
31
+
32
+ **Additional context**
33
+
34
+ Add any other context about the problem here.
35
+ Example: GPU model
.github/ISSUE_TEMPLATE/feature_request.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: Feature request
3
+ about: Suggest an idea for this project
4
+ title: ''
5
+ labels: feature request
6
+ assignees: okuchaiev
7
+
8
+ ---
9
+
10
+ **Is your feature request related to a problem? Please describe.**
11
+
12
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
13
+
14
+ **Describe the solution you'd like**
15
+
16
+ A clear and concise description of what you want to happen.
17
+ Provide a code snippet on how new APIs/changes would be used by others.
18
+
19
+ **Describe alternatives you've considered**
20
+
21
+ A clear and concise description of any alternative solutions or features you've considered.
22
+
23
+ **Additional context**
24
+
25
+ Add any other context or screenshots about the feature request here.
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ > [!IMPORTANT]
2
+ > The `Update branch` button must only be pressed in very rare occassions.
3
+ > An outdated branch is never blocking the merge of a PR.
4
+ > Please reach out to the automation team before pressing that button.
5
+
6
+ # What does this PR do ?
7
+
8
+ Add a one line overview of what this PR aims to accomplish.
9
+
10
+ **Collection**: [Note which collection this PR will affect]
11
+
12
+ # Changelog
13
+
14
+ - Add specific line by line info of high level changes in this PR.
15
+
16
+ # Usage
17
+
18
+ - You can potentially add a usage example below
19
+
20
+ ```python
21
+ # Add a code snippet demonstrating how to use this
22
+ ```
23
+
24
+ # GitHub Actions CI
25
+
26
+ The Jenkins CI system has been replaced by GitHub Actions self-hosted runners.
27
+
28
+ The GitHub Actions CI will run automatically when the "Run CICD" label is added to the PR.
29
+ To re-run CI remove and add the label again.
30
+ To run CI on an untrusted fork, a NeMo user with write access must first click "Approve and run".
31
+
32
+ # Before your PR is "Ready for review"
33
+
34
+ **Pre checks**:
35
+
36
+ - [ ] Make sure you read and followed [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md)
37
+ - [ ] Did you write any new necessary tests?
38
+ - [ ] Did you add or update any necessary documentation?
39
+ - [ ] Does the PR affect components that are optional to install? (Ex: Numba, Pynini, Apex etc)
40
+ - [ ] Reviewer: Does the PR have correct import guards for all optional libraries?
41
+
42
+ **PR Type**:
43
+
44
+ - [ ] New Feature
45
+ - [ ] Bugfix
46
+ - [ ] Documentation
47
+
48
+ If you haven't finished some of the above items you can still open "Draft" PR.
49
+
50
+ ## Who can review?
51
+
52
+ Anyone in the community is free to review the PR once the checks have passed.
53
+ [Contributor guidelines](https://github.com/NVIDIA/NeMo/blob/main/CONTRIBUTING.md) contains specific people who can review PRs to various areas.
54
+
55
+ # Additional Information
56
+
57
+ - Related to # (issue)
.github/actions/cancel-workflow/action.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Cancel Workflow
2
+ description: >
3
+ Cancels the current workflow run, i.e. all jobs. Useful if you want to cancel the rest of the workflow when one job
4
+ fails. Note that this will cause the workflow to appear cancelled, not failed.
5
+
6
+ # Cancelling the workflow in a post-script (like this:
7
+ # https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runspost; can also be done with
8
+ # this action: https://github.com/webiny/action-post-run, see Git history of this file) wouldn't help the status, it
9
+ # would still be cancelled. It actually indeed is, but it would be nicer to set it to failed, but there seems to be no
10
+ # way to do this.
11
+
12
+ runs:
13
+ using: "composite"
14
+ steps:
15
+ - name: Cancel Workflow
16
+ # # Fork PRs won't have a token with write access to Actions, thus won't be able to cancel the workflow.
17
+ # if: github.event.pull_request == '' || github.event.pull_request.head.repo.fork == false
18
+ shell: bash
19
+ run: |
20
+ curl --verbose \
21
+ -X POST \
22
+ -H "Accept: application/vnd.github+json" \
23
+ -H "Authorization: Bearer ${{ github.token }}" \
24
+ -H "X-GitHub-Api-Version: 2022-11-28" \
25
+ https://api.github.com/repos/${{ github.repository }}/actions/runs/${{ github.run_id }}/cancel
.github/actions/test-template/action.yml ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: "Test Template"
15
+ description: "Template for running NeMo tests in a containerized environment"
16
+
17
+ inputs:
18
+ runner:
19
+ description: "Runner to use for test"
20
+ required: true
21
+ timeout:
22
+ description: "Max runtime of test in minutes"
23
+ required: false
24
+ default: "10"
25
+ script:
26
+ description: "Test script to execute"
27
+ required: true
28
+ after_script:
29
+ description: "Script to run after main test"
30
+ required: false
31
+ default: ":"
32
+ is_optional:
33
+ description: "Failure will cancel all other tests if set to true"
34
+ required: false
35
+ default: "false"
36
+ is_unit_test:
37
+ description: "Upload coverage as unit test"
38
+ required: false
39
+ default: "false"
40
+ tests_to_run:
41
+ description: "Tests to run"
42
+ required: false
43
+ default: '["all"]'
44
+ image:
45
+ description: "Image to use for test"
46
+ required: false
47
+ default: "nemo_container"
48
+ cpu-only:
49
+ description: "Run tests on CPU only"
50
+ required: false
51
+ default: "false"
52
+ runs:
53
+ using: "composite"
54
+ steps:
55
+ - name: Noop
56
+ shell: bash
57
+ run: |
58
+ chmod -R u+rwX ${{ github.run_id }}
59
+ echo "noop"
60
+
61
+ - name: Docker system cleanup
62
+ shell: bash
63
+ run: |
64
+ docker system prune -af --filter "until=24h" --filter "label!=nemo.pr_number=${{ github.event.pull_request.number || 0 }}" --force || true
65
+
66
+ - name: Docker pull image
67
+ shell: bash
68
+ run: |
69
+ docker pull nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }}
70
+
71
+ - name: Clean repos
72
+ shell: bash
73
+ run: |
74
+
75
+ - name: Create UUID
76
+ id: uuid
77
+ shell: bash
78
+ run: |
79
+ echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
80
+
81
+ - name: Checkout NeMo
82
+ uses: actions/checkout@v2
83
+ env:
84
+ DIR: ${{ github.run_id }}
85
+ with:
86
+ repository: NVIDIA/NeMo
87
+ path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
88
+
89
+ - name: Start container
90
+ shell: bash
91
+ env:
92
+ DIR: ${{ github.run_id }}
93
+ run: |
94
+ mkdir -p $DIR
95
+
96
+ # Map of runner names to GPU device configurations
97
+ declare -A GPU_CONFIGS=(
98
+ ["myVm-01"]="0,1"
99
+ ["myVm-02"]="2,3"
100
+ ["myVm-03"]="4,5"
101
+ ["myVm-04"]="6,7"
102
+ )
103
+
104
+ ARG=("")
105
+ if [[ "${{ inputs.cpu-only }}" == "false" ]]; then
106
+ ARG=("--runtime=nvidia --gpus all")
107
+ fi
108
+
109
+ cmd=$(cat <<RUN_TEST_EOF
110
+ #!/bin/bash
111
+ docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
112
+ docker run \
113
+ --rm \
114
+ -d \
115
+ --name nemo_container_${{ github.run_id }}_${{ inputs.runner }} ${ARG[@]} \
116
+ --shm-size=64g \
117
+ --env TRANSFORMERS_OFFLINE=0 \
118
+ --env HYDRA_FULL_ERROR=1 \
119
+ --env HF_HOME=/home/TestData/HF_HOME \
120
+ --env RUN_ID=${{ github.run_id }} \
121
+ --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
122
+ --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/${{ inputs.image }}:${{ github.run_id }} \
123
+ bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
124
+ RUN_TEST_EOF
125
+ )
126
+
127
+ echo "$cmd" | tee "$DIR/retry_job.sh"
128
+ bash $DIR/retry_job.sh
129
+
130
+ - name: Create run-script
131
+ id: create
132
+ env:
133
+ DIR: ${{ github.run_id }}
134
+ shell: bash
135
+ run: |
136
+ COVERAGE_PREFIX=$([[ "${{ inputs.is_unit_test }}" == "true" ]] && echo "unit-test" || echo "e2e")
137
+ echo "coverage-prefix=$COVERAGE_PREFIX" | tee -a "$GITHUB_OUTPUT"
138
+
139
+ mkdir -p $DIR
140
+ rm $DIR/.coverage || true
141
+ rm $DIR/err.log || true
142
+
143
+ cmd=$(cat <<RUN_TEST_EOF
144
+ #!/bin/bash
145
+
146
+ (
147
+ set -e
148
+
149
+ docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c '\
150
+ cp -r /opt/Megatron-LM/ /workspace/ && \
151
+ bash tests/functional_tests/${{ inputs.script }}.sh && \
152
+ echo "Finished successfully." || echo "Did not finish."'
153
+ ) 2>&1 | tee $DIR/err.log
154
+
155
+ RUN_TEST_EOF
156
+ )
157
+
158
+ echo "timeout_in_seconds=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
159
+ echo "$cmd" | tee "$DIR/job.sh"
160
+
161
+ - name: Run main script
162
+ uses: nick-fields/retry@v3
163
+ with:
164
+ timeout_seconds: ${{ steps.create.outputs.timeout_in_seconds }}
165
+ max_attempts: 3
166
+ shell: bash
167
+ retry_on: timeout
168
+ command: /bin/bash ${{ github.run_id }}/job.sh
169
+ on_retry_command: /bin/bash ${{ github.run_id }}/retry_job.sh
170
+
171
+ - name: Check result
172
+ id: check
173
+ shell: bash
174
+ env:
175
+ DIR: ${{ github.run_id }}
176
+ run: |
177
+ cat $DIR/err.log
178
+
179
+ log=$(tail -c 2000 $DIR/err.log | base64 -w 0)
180
+ echo "log=$log" >> "$GITHUB_OUTPUT"
181
+
182
+ potential_infra_failure=$(cat $DIR/err.log | grep -Eqiw "device" && echo true || echo false)
183
+ echo "potential_infra_failure=$potential_infra_failure" >> "$GITHUB_OUTPUT"
184
+
185
+ docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage combine
186
+ docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage xml
187
+ docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/.coverage $DIR/.coverage
188
+ docker cp nemo_container_${{ github.run_id }}_${{ inputs.runner }}:/workspace/coverage.xml $DIR/coverage.xml
189
+
190
+ coverage_report=coverage-${{ steps.create.outputs.coverage-prefix }}-${{ github.run_id }}-$(uuidgen)
191
+ echo "coverage_report=$coverage_report" >> "$GITHUB_OUTPUT"
192
+
193
+ IS_SUCCESS=$(tail -n 1 $DIR/err.log | grep -q "Finished successfully." && echo "true" || echo "false")
194
+
195
+ if [[ "$IS_SUCCESS" == "false" && "${{ inputs.is_optional }}" == "true" ]]; then
196
+ echo "::warning:: Test failed, but displayed as successful because it is marked as optional."
197
+ IS_SUCCESS=true
198
+ fi
199
+
200
+ if [[ "$IS_SUCCESS" == "false" ]]; then
201
+ echo Test did not finish successfully.
202
+ exit 1
203
+ fi
204
+
205
+ exit $EXIT_CODE
206
+
207
+ - name: Test coverage
208
+ shell: bash -x -e -u -o pipefail {0}
209
+ run: |
210
+ docker exec -t nemo_container_${{ github.run_id }}_${{ inputs.runner }} coverage report -i
211
+
212
+ - name: Upload artifacts
213
+ uses: actions/upload-artifact@v4
214
+ if: ${{ steps.check.outputs.coverage_report != 'none' }}
215
+ with:
216
+ name: ${{ steps.check.outputs.coverage_report }}
217
+ path: |
218
+ ${{ github.run_id }}/coverage.xml
219
+ ${{ github.run_id }}/.coverage
220
+ include-hidden-files: true
221
+
222
+ - name: Container shutdown
223
+ if: always()
224
+ shell: bash
225
+ run: |
226
+ docker exec nemo_container_${{ github.run_id }}_${{ inputs.runner }} bash -c "chown -R $(id -u):$(id -g) /workspace"
227
+ rm -rf $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }} || true
228
+ docker container rm -f nemo_container_${{ github.run_id }}_${{ inputs.runner }} || true
.github/labeler.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ASR:
2
+ - nemo/collections/asr/**/*
3
+ - examples/asr/**/*
4
+ - tutorials/asr/**/*
5
+ - docs/source/asr/**/*
6
+ - tests/collections/asr/**
7
+
8
+ NLP:
9
+ - nemo/collections/nlp/**/*
10
+ - examples/nlp/**/*
11
+ - tutorials/nlp/**/*
12
+ - docs/source/nlp/**/*
13
+ - tests/collections/nlp/**
14
+
15
+ Multi Modal:
16
+ - nemo/collections/multimodal/**/*
17
+ - examples/multimodal/**/*
18
+ - tutorials/multimodal/**/*
19
+ - docs/source/multimodal/**/*
20
+ - tests/collections/multimodal/**
21
+
22
+ Speaker Tasks:
23
+ - examples/speaker_tasks/**/*
24
+ - tutorials/speaker_tasks/**/*
25
+
26
+ TTS:
27
+ - nemo/collections/tts/**/*
28
+ - nemo/collections/common/tokenizers/text_to_speech/**
29
+ - examples/tts/**/*
30
+ - tutorials/tts/**/*
31
+ - docs/source/tts/**/*
32
+ - scripts/dataset_processing/tts/**
33
+ - scripts/tts_dataset_files/**
34
+ - tests/collections/tts/**
35
+ - tests/collections/common/tokenizers/text_to_speech/**
36
+
37
+ Audio:
38
+ - nemo/collections/audio/**/*
39
+ - examples/audio/**/*
40
+ - tutorials/audio/**/*
41
+ - docs/source/audio/**/*
42
+ - tests/collections/audio/**
43
+
44
+ core:
45
+ - nemo/core/**/*
46
+ - tests/core/**
47
+
48
+ common:
49
+ - nemo/collections/common/**/*
50
+
51
+ CI:
52
+ - .github/**/*
53
+ - Jenkinsfile
54
+ - Dockerfile
55
+ - ci.groovy
.github/scripts/__init__.py ADDED
File without changes
.github/scripts/components_to_run.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ #!/usr/bin/env python3
16
+ import json
17
+ import os
18
+ import sys
19
+ from typing import Any, Dict, List, Set
20
+
21
+ import click
22
+ import git
23
+
24
+ import nemo_dependencies
25
+
26
+
27
+ def get_changed_files(source_sha: str, target_sha: str) -> List[str]:
28
+ """
29
+ Fetch the changelog between current branch and main.
30
+ Returns a list of dictionaries containing commit information.
31
+ """
32
+ try:
33
+ # Initialize the repo object - go up two levels from this file's location
34
+ repo = git.Repo(os.path.join(os.path.dirname(__file__), "..", ".."))
35
+
36
+ # Get the diff between target and source
37
+ diff_index = repo.commit(target_sha).diff(repo.commit(source_sha))
38
+
39
+ # Get just the changed filenames
40
+ changed_files = []
41
+ for diff in diff_index:
42
+ changed_files.append(diff.a_path if diff.a_path else diff.b_path)
43
+
44
+ return changed_files
45
+
46
+ except git.exc.GitCommandError as e:
47
+ print(f"Error fetching changelog: {e}", file=sys.stderr)
48
+ sys.exit(1)
49
+ except Exception as e:
50
+ print(f"Unexpected error: {e}", file=sys.stderr)
51
+ sys.exit(1)
52
+
53
+
54
+ @click.command()
55
+ @click.option('--source-sha', type=str, required=True, help='Source commit SHA')
56
+ @click.option('--target-sha', type=str, required=True, help='Target commit sha')
57
+ def main(source_sha: str, target_sha: str):
58
+ """
59
+ Main function to fetch and output the changelog and changed files.
60
+ """
61
+
62
+ # Output unique changed files
63
+ print("\nChanged files:")
64
+ changed_files = get_changed_files(source_sha, target_sha)
65
+
66
+ print(json.dumps(sorted(list(changed_files)), indent=2))
67
+
68
+ nemo_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
69
+ # Build dependency graph
70
+ dependencies = nemo_dependencies.build_dependency_graph(nemo_root)
71
+
72
+ test_modules: List[str] = []
73
+ for changed_file in changed_files:
74
+ if changed_file in dependencies:
75
+ test_modules.extend(dependencies[changed_file])
76
+
77
+ test_modules = list(set(test_modules))
78
+
79
+ with open("test_modules.json", "w", encoding="utf-8") as f:
80
+ json.dump(test_modules, f)
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
.github/scripts/nemo_dependencies.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ NeMo dependency structure definition.
18
+ This module analyzes the codebase to determine internal dependencies between NeMo collections and core components.
19
+ """
20
+
21
+ import ast
22
+ import json
23
+ import os
24
+ from typing import Dict, List, Set
25
+
26
+
27
def find_python_files(directory: str) -> List[str]:
    """Collect every ``*.py`` file under the repo's relevant top-level folders.

    Only ``nemo``, ``scripts``, ``examples`` and ``tests`` are scanned;
    any other top-level folder is ignored. Missing folders are skipped.
    """
    # Folders considered part of the analyzable codebase.
    scan_roots = ('nemo', 'scripts', 'examples', 'tests')

    collected: List[str] = []
    for folder in scan_roots:
        base = os.path.join(directory, folder)
        if not os.path.exists(base):
            continue
        for current_dir, _, filenames in os.walk(base):
            collected.extend(
                os.path.join(current_dir, name) for name in filenames if name.endswith('.py')
            )
    return collected
42
+
43
+
44
def analyze_imports(nemo_root: str, file_path: str) -> Set[str]:
    """Analyze a Python file and return its NeMo package dependencies using AST parsing.

    Parses ``file_path`` with ``ast`` (the module is never executed) and
    collects fully-qualified ``nemo.*`` names imported via ``from nemo.X import Y``.
    Names re-exported through package ``__init__.py`` files are resolved to
    their final destination where possible. Parse/IO errors are printed and
    yield a partial (possibly empty) result rather than raising.
    """
    imports = set()
    visited = set()  # Track visited modules to prevent circular imports

    def get_init_imports(module_path: str, depth: int = 0) -> Dict[str, str]:
        """Recursively analyze imports from __init__.py files and map them to their final destinations."""
        # Prevent infinite recursion
        if depth > 10 or module_path in visited:  # Limit depth to 10 levels
            return {}

        # NOTE(review): `visited` is shared across all top-level calls within
        # this file's analysis, so a package __init__ is only expanded once
        # per analyzed file — confirm this caching is intended.
        visited.add(module_path)
        init_path = os.path.join(module_path, '__init__.py')
        if not os.path.exists(init_path):
            return {}

        try:
            with open(init_path, 'r', encoding='utf-8') as f:
                init_tree = ast.parse(f.read(), filename=init_path)

            import_map = {}
            for node in ast.walk(init_tree):
                # Only `from nemo.* import ...` statements are considered.
                if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
                    if node.names:
                        for name in node.names:
                            if name.name == '*':
                                continue

                            # Get the full module path for the import
                            module_parts = node.module.split('.')
                            module_dir = os.path.join(nemo_root, *module_parts)

                            # If the imported module has an __init__.py, recursively analyze it
                            if os.path.exists(os.path.join(module_dir, '__init__.py')):
                                sub_imports = get_init_imports(module_dir, depth + 1)
                                if name.name in sub_imports:
                                    import_map[name.name] = sub_imports[name.name]
                                else:
                                    # If not found in sub-imports, it might be from the module itself
                                    module_file = os.path.join(module_dir, f"{module_parts[-1]}.py")
                                    if os.path.exists(module_file):
                                        import_map[name.name] = f"{node.module}.{name.name}"
                            else:
                                # Direct module import
                                import_map[name.name] = f"{node.module}.{name.name}"

            return import_map
        except Exception as e:
            # Best-effort: a broken __init__.py should not abort the scan.
            print(f"Error analyzing {init_path}: {e}")
            return {}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            tree = ast.parse(f.read(), filename=file_path)

        for node in ast.walk(tree):
            if isinstance(node, ast.ImportFrom) and node.module and node.module.startswith('nemo.'):
                # Split the module path
                parts = node.module.split('.')

                # Bare `from nemo import ...` carries no sub-package info.
                if len(parts) == 1:
                    continue

                if len(parts) >= 2:
                    module_type = parts[1]

                    if module_type == 'collections':
                        # `from nemo.collections import ...` is too coarse to map.
                        if len(parts) == 2:
                            continue
                        if node.names:
                            for name in node.names:
                                if name.name == '*':
                                    continue

                                # Check if this is an __init__ import
                                module_path = os.path.join(nemo_root, *parts)
                                init_imports = get_init_imports(module_path)

                                if name.name in init_imports:
                                    # Use the mapped import path
                                    imports.add(init_imports[name.name])
                                else:
                                    imports.add(f"{node.module}.{name.name}")

                    elif module_type in find_top_level_packages(nemo_root):
                        # Same resolution logic for non-collection top-level
                        # packages (e.g. nemo.core, nemo.utils).
                        if node.names:
                            for name in node.names:
                                if name.name == '*':
                                    continue

                                # Check if this is an __init__ import
                                module_path = os.path.join(nemo_root, *parts)
                                init_imports = get_init_imports(module_path)

                                if name.name in init_imports:
                                    # Use the mapped import path
                                    imports.add(init_imports[name.name])
                                else:
                                    imports.add(f"{node.module}.{name.name}")

    except Exception as e:
        # Best-effort: unparseable files are reported, not fatal.
        print(f"Error analyzing {file_path}: {e}")

    return imports
148
+
149
+
150
def find_top_level_packages(nemo_root: str) -> List[str]:
    """Find all top-level packages under the ``nemo/`` and ``tests/`` directories.

    Returns a sorted, de-duplicated list of directory names, skipping dunder
    folders such as ``__pycache__``. If either directory is missing, prints a
    warning and returns an empty list.
    """
    nemo_dir = os.path.join(nemo_root, 'nemo')
    tests_dir = os.path.join(nemo_root, 'tests')

    if not os.path.exists(nemo_dir):
        print(f"Warning: nemo directory not found at {nemo_dir}")
        return []
    if not os.path.exists(tests_dir):
        # Bug fix: this branch previously reported the nemo directory path.
        print(f"Warning: tests directory not found at {tests_dir}")
        return []

    packages: Set[str] = set()
    # Bug fix: entries listed from tests/ used to be stat'ed against
    # nemo_dir, silently dropping any package that exists only under
    # tests/. Resolve each entry against the directory it came from,
    # and use a set to de-duplicate names present in both trees.
    for base_dir in (nemo_dir, tests_dir):
        for item in os.listdir(base_dir):
            item_path = os.path.join(base_dir, item)
            if os.path.isdir(item_path) and not item.startswith('__'):
                packages.add(item)

    return sorted(packages)
169
+
170
+
171
def find_collection_modules(nemo_root: str) -> Dict[str, List[str]]:
    """Enumerate collections under ``nemo/collections``.

    Each collection directory (skipping dunder folders) becomes a key of the
    form ``nemo.collections.<name>`` mapped to an empty module list. Returns
    an empty mapping with a warning if the collections directory is missing.
    """
    collections_dir = os.path.join(nemo_root, 'nemo', 'collections')

    if not os.path.exists(collections_dir):
        print(f"Warning: collections directory not found at {collections_dir}")
        return {}

    result: Dict[str, List[str]] = {}
    for entry in os.listdir(collections_dir):
        if entry.startswith('__'):
            continue
        if not os.path.isdir(os.path.join(collections_dir, entry)):
            continue
        result[f"nemo.collections.{entry}"] = []

    return result
186
+
187
+
188
def build_dependency_graph(nemo_root: str) -> Dict[str, List[str]]:
    """Build a dependency graph by analyzing all Python files.

    Pipeline (order matters):
      1. Map each source/test module to the nemo.* names it imports.
      2. Invert the map into reverse dependencies (who depends on me).
      3. Close it transitively (fixed-point iteration).
      4. Simplify keys/values to file paths, top-level packages, or
         collection modules.
      5. Bucket everything into CI test groups (speech, export-deploy,
         automodel, nemo2, unit-tests).
      6. Append blanket entries for requirements files, workflow files,
         and Dockerfiles.
    Returns a dict mapping a changed path to the sorted CI buckets to run.
    """
    # Find all top-level packages
    top_level_packages = find_top_level_packages(nemo_root)
    print(f"Found top-level packages: {top_level_packages}")

    dependencies: Dict[str, List[str]] = {}

    # Phase 1: per-file forward imports.
    for file_path in find_python_files(nemo_root):
        relative_path = os.path.relpath(file_path, nemo_root)

        parts = relative_path.split(os.sep)

        # Only files under nemo/ or tests/ participate in the graph.
        if len(parts) == 1 or (parts[0] != "nemo" and parts[0] != "tests"):
            continue

        module_path = relative_path.replace(".py", "").replace("/", ".")
        if parts[1] in top_level_packages and parts[1] != 'collections' and parts[0] != 'tests':
            dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))
        elif parts[0] == 'tests':
            # Test files depend only on themselves (a changed test reruns itself).
            dependencies[module_path] = [relative_path.replace("/", ".").replace(".py", "")]
        elif parts[1] == 'collections':
            dependencies[module_path] = list(set(analyze_imports(nemo_root, file_path)))

    # Phase 2: flip the dependency graph to show reverse dependencies
    reverse_dependencies: Dict[str, List[str]] = {}
    # Handle top-level package dependencies
    for package, deps in dependencies.items():
        for dep in deps:
            if dep not in reverse_dependencies:
                reverse_dependencies[dep] = []
            reverse_dependencies[dep].append(package)
    dependencies = reverse_dependencies

    # Phase 3: follow and extend records with transitive dependencies
    transitive_dependencies = dependencies.copy()
    # Keep iterating until no new dependencies are added (fixed point).
    while True:
        changes_made = False
        new_dependencies = transitive_dependencies.copy()

        # For each package and its direct dependencies
        for package, deps in transitive_dependencies.items():
            # For each direct dependency
            for dep in deps:
                # If the dependency has its own dependencies
                if dep in transitive_dependencies:
                    # Add those transitive dependencies to the original package
                    for transitive_dep in transitive_dependencies[dep]:
                        if transitive_dep not in new_dependencies[package]:
                            new_dependencies[package].append(transitive_dep)
                            changes_made = True

        # Update dependencies with new transitive ones
        transitive_dependencies = new_dependencies

        # If no new dependencies were added, we're done
        if not changes_made:
            break

    dependencies = transitive_dependencies

    # Phase 4: simplify values: Either top-level package or collection module
    simplified_dependencies: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        package_parts = package.split('.')

        # NOTE(review): the isfile/isdir probes below are relative to the
        # current working directory, not nemo_root — confirm callers always
        # run from the repo root.
        if package_parts[0] == "tests":
            simplified_package_path = f"{os.path.join(*package_parts)}.py"
        elif os.path.isfile((file_path := f"{os.path.join(*package_parts[:-1])}.py")):
            simplified_package_path = file_path
        elif os.path.isdir((file_path := f"{os.path.join(*package_parts[:-1])}")):
            simplified_package_path = file_path
        else:
            simplified_package_path = package

        for dep in deps:
            dep_parts = dep.split('.')

            if simplified_package_path not in simplified_dependencies:
                simplified_dependencies[simplified_package_path] = []

            if (
                len(dep_parts) >= 2
                and (dep_parts[1] in find_top_level_packages(nemo_root))
                and dep_parts[1] != 'collections'
            ):
                simplified_dependencies[simplified_package_path].append(f"{dep_parts[0]}.{dep_parts[1]}")
            elif dep_parts[0] == "tests":
                simplified_dependencies[simplified_package_path].append(".".join(dep_parts))
            elif len(dep_parts) >= 3 and (
                simplified_name := f"nemo.{dep_parts[1]}.{dep_parts[2]}"
            ) in find_collection_modules(nemo_root):
                simplified_dependencies[simplified_package_path].append(simplified_name)

        # NOTE(review): if `deps` is empty this append can hit a missing key
        # (KeyError) since the key is only created inside the loop — verify
        # empty dep lists cannot occur here.
        simplified_dependencies[simplified_package_path].append(package)
        simplified_dependencies[simplified_package_path] = sorted(
            list(set(simplified_dependencies[simplified_package_path]))
        )
    dependencies = simplified_dependencies

    # Phase 5: bucket into CI test groups (substring matching on dep names).
    bucket_deps: Dict[str, List[str]] = {}
    for package, deps in dependencies.items():
        new_deps = []
        for dep in deps:
            if (
                "nemo.collections.asr" in dep
                or "nemo.collections.tts" in dep
                or "nemo.collections.speechlm" in dep
                or "nemo.collections.audio" in dep
                or "tests.collections.asr" in dep
                or "tests.collections.tts" in dep
                or "tests.collections.speechlm" in dep
                or "tests.collections.audio" in dep
            ):
                new_deps.append("speech")
                new_deps.append("unit-tests")

            if "nemo.export" in dep or "nemo.deploy" in dep or "tests.export" in dep or "tests.deploy" in dep:
                new_deps.append("export-deploy")
                new_deps.append("unit-tests")

            if (
                "nemo.collections.llm" in dep
                or "nemo.collections.vlm" in dep
                or "nemo.automodel" in dep
                or "tests.collections.llm" in dep
                or "tests.collections.vlm" in dep
                or "tests.automodel" in dep
            ):
                new_deps.append("automodel")
                new_deps.append("unit-tests")

            if "tests" in dep and "tests.functional_tests" not in dep:
                new_deps.append("unit-tests")

            # Any non-speech collection dependency maps to the nemo2 bucket.
            if (
                "nemo.collections" in dep
                and "nemo.collections.asr" not in dep
                and "nemo.collections.tts" not in dep
                and "nemo.collections.speechlm" not in dep
                and "nemo.collections.audio" not in dep
                and "tests.collections.asr" not in dep
                and "tests.collections.tts" not in dep
                and "tests.collections.speechlm" not in dep
                and "tests.collections.audio" not in dep
            ):
                new_deps.append("nemo2")
                new_deps.append("unit-tests")

        bucket_deps[package] = sorted(list(set(new_deps)))

    dependencies = bucket_deps

    # Phase 6: additional dependencies
    # Add all files in requirements/ directory — any requirements change
    # triggers every bucket.
    requirements_dir = os.path.join(nemo_root, "requirements")
    if os.path.exists(requirements_dir):
        for filename in os.listdir(requirements_dir):
            # NOTE(review): relpath of a cwd-relative path against nemo_root —
            # produces "../.." segments unless cwd == nemo_root; confirm.
            filepath = os.path.join("requirements", filename)
            relative_path = os.path.relpath(filepath, nemo_root)

            dependencies[relative_path] = [
                "nemo2",
                "unit-tests",
                "speech",
                "automodel",
                "export-deploy",
            ]

    # Add all Dockerfile files and workflow files that drive each bucket.
    for root, _, files in os.walk(nemo_root):
        for file_path in files:
            full_path = os.path.join(root, file_path)
            relative_path = os.path.relpath(full_path, nemo_root)

            if "cicd-main-export-deploy" in file_path:
                dependencies[relative_path] = ["export-deploy"]
            if "cicd-main-nemo2" in file_path:
                dependencies[relative_path] = ["nemo2"]
            if "cicd-main-speech" in file_path:
                dependencies[relative_path] = ["speech"]
            if "cicd-main-automodel" in file_path:
                dependencies[relative_path] = ["automodel"]
            if "cicd-main-unit-tests" in file_path:
                dependencies[relative_path] = ["unit-tests"]
            if "Dockerfile" in file_path:
                dependencies[relative_path] = ["nemo2", "unit-tests", "speech", "automodel", "export-deploy"]

    # Sort dependencies by length of values (number of dependencies)
    dependencies = dict(sorted(dependencies.items(), key=lambda x: len(x[1]), reverse=True))

    return dependencies
382
+
383
+
384
def main():
    """Entry point: compute the NeMo dependency graph and dump it to JSON."""
    # The repository root sits three levels above this script
    # (.github/scripts/<this file>).
    script_path = os.path.abspath(__file__)
    repo_root = os.path.dirname(os.path.dirname(os.path.dirname(script_path)))

    # Build dependency graph for the whole repository.
    graph = build_dependency_graph(repo_root)

    # Serialize to pretty-printed JSON, then write it out.
    serialized = json.dumps(graph, indent=4)
    with open('nemo_dependencies.json', 'w', encoding='utf-8') as handle:
        handle.write(serialized)


if __name__ == "__main__":
    main()
.github/scripts/notify.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+
16
+ import requests
17
+ from github import Github
18
+
19
+
20
def send_slack_notification():
    """Post a Slack message summarizing the failed jobs of a CI workflow run.

    Reads configuration from environment variables (GH_TOKEN, SLACK_WEBHOOK,
    REPOSITORY, RUN_ID, SERVER_URL, PR_NUMBER, BRANCH_NAME), queries the
    GitHub API for jobs of the run that concluded 'failure', and posts a
    Slack "blocks" payload to the webhook. Raises on a non-2xx Slack reply.
    """
    # Get environment variables
    gh_token = os.environ.get('GH_TOKEN')
    webhook_url = os.environ.get('SLACK_WEBHOOK')
    repository = os.environ.get('REPOSITORY')
    run_id = os.environ.get('RUN_ID')
    server_url = os.environ.get('SERVER_URL', 'https://github.com')
    # NOTE(review): int('') raises if PR_NUMBER is set but empty — the
    # workflow presumably always sets a numeric value; confirm.
    pr_number = int(os.environ.get('PR_NUMBER', 0))
    branch_name = os.environ.get('BRANCH_NAME')

    # Get failure info from GitHub API
    gh = Github(gh_token)
    repo = gh.get_repo(repository)

    # Get failed jobs (only those whose conclusion is exactly 'failure')
    failed_jobs = [job.name for job in repo.get_workflow_run(int(run_id)).jobs() if job.conclusion == 'failure']

    if pr_number != 0:
        # PR-triggered run: link the PR, its author and its head branch.
        pr = repo.get_pull(pr_number)

        # Backticks are stripped from the title to keep Slack mrkdwn intact.
        title = f"*<{server_url}/{repository}/pull/{pr_number}|PR#{pr_number}>: {pr.title.replace('`', '')}*"
        author = f"<{server_url}/{pr.user.login}|{pr.user.login}>"
        branch = f"<{server_url}/{pr.head.repo.full_name}/tree/{pr.head.ref}|{pr.head.ref}>"
    else:
        # Branch-triggered run (e.g. scheduled/push): no PR metadata available.
        title = f"*Run on <{server_url}/{repository}/tree/{branch_name}|{branch_name}>*"
        author = "No author"
        branch = f"<{server_url}/{repository}/tree/{branch_name}|{branch_name}>"

    # Slack Block Kit payload: a single mrkdwn section listing failed jobs,
    # excluding the aggregate 'Nemo_CICD_Test' gate job.
    blocks = [
        {
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": (
                    f"{title}\n"
                    f"• Author: {author}\n"
                    f"• Branch: {branch}\n"
                    f"• Pipeline: <{server_url}/{repository}/actions/runs/{run_id}|View Run>\n"
                    f"• Failed Jobs:\n"
                    + "\n".join(
                        [
                            f" • <{server_url}/{repository}/actions/runs/{run_id}|{job.split('/')[-1]}>"
                            for job in failed_jobs
                            if job.split('/')[-1] != 'Nemo_CICD_Test'
                        ]
                    )
                ),
            },
        }
    ]

    # Log the payload for debugging in the workflow output.
    print({"blocks": blocks})

    # Send to Slack
    response = requests.post(webhook_url, json={"blocks": blocks})
    response.raise_for_status()


if __name__ == "__main__":
    send_slack_notification()
.github/workflows/_build_container.yml ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: ~Build container template
2
+ on:
3
+ workflow_call:
4
+ inputs:
5
+ image-name:
6
+ required: true
7
+ type: string
8
+ description: "The name of the image to build"
9
+ dockerfile:
10
+ required: true
11
+ type: string
12
+ runner:
13
+ required: false
14
+ default: self-hosted-azure-builder
15
+ type: string
16
+ description: "The runner to use for the build"
17
+
18
+ jobs:
19
+ pre-flight:
20
+ runs-on: ubuntu-latest
21
+ outputs:
22
+ build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
23
+ cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
24
+ steps:
25
+ - name: Checkout repository
26
+ uses: actions/checkout@v4
27
+
28
+ - name: Parse manifest.json
29
+ id: manifest
30
+ run: |
31
+ BUILD_ARGS=$(cat << EOF
32
+ BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
33
+ TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
34
+ TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
35
+ MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
36
+ MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
37
+ TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
38
+ TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
39
+ APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
40
+ APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
41
+ EOF
42
+ )
43
+
44
+ echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
45
+ echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
46
+ echo "EOF" >> $GITHUB_OUTPUT
47
+
48
+ - name: Get last merged PR
49
+ id: cache_from
50
+ env:
51
+ GH_TOKEN: ${{ github.token }}
52
+ run: |
53
+ LAST_PRS=$(gh api graphql -f query='
54
+ query {
55
+ repository(owner: "NVIDIA", name: "NeMo") {
56
+ pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
57
+ nodes {
58
+ number
59
+ }
60
+ }
61
+ }
62
+ }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
63
+ echo "nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:$number"
64
+ done)
65
+
66
+ echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
67
+ echo "$LAST_PRS" >> $GITHUB_OUTPUT
68
+ echo "EOF" >> $GITHUB_OUTPUT
69
+
70
+ build:
71
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.27.0
72
+ needs: [pre-flight]
73
+ with:
74
+ image-name: ${{ inputs.image-name }}
75
+ dockerfile: ${{ inputs.dockerfile }}
76
+ image-label: nemo-core
77
+ build-args: |
78
+ IMAGE_LABEL=nemo-core
79
+ NEMO_TAG=${{ github.sha }}
80
+ NEMO_REPO=https://github.com/NVIDIA/NeMo
81
+ PR_NUMBER=${{ github.event.pull_request.number || 0 }}
82
+ ${{ needs.pre-flight.outputs.build_args }}
83
+ prune-filter-timerange: 24h
84
+ use-inline-cache: false
85
+ cache-from: |
86
+ nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:main
87
+ nemoci.azurecr.io/${{ inputs.image-name }}-buildcache:${{ github.event.pull_request.number || 0 }}
88
+ ${{ needs.pre-flight.outputs.cache-from }}
89
+ runner: ${{ inputs.runner }}
.github/workflows/_bump_mcore_tag.yml ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: ~Bump Megatron Tag template
2
+ on:
3
+ workflow_call:
4
+ inputs:
5
+ nemo-target-branch:
6
+ required: true
7
+ type: string
8
+ description: "The target branch to bump"
9
+ mcore-target-branch:
10
+ required: true
11
+ type: string
12
+ description: "The target branch to bump"
13
+ secrets:
14
+ PAT:
15
+ required: true
16
+
17
+ jobs:
18
+ update-branch:
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v2
22
+ with:
23
+ ref: ${{ inputs.nemo-target-branch }}
24
+
25
+ - name: Set Git config
26
+ run: |
27
+ git config --local user.email "actions@github.com"
28
+ git config --local user.name "Github Actions"
29
+ - name: Merge weekly-bump-${{ inputs.nemo-target-branch }} back to base branch
30
+ env:
31
+ SOURCE_BRANCH: weekly-bump-${{ inputs.nemo-target-branch }}
32
+ TARGET_BRANCH: ${{ inputs.nemo-target-branch }}
33
+ run: |
34
+ if git ls-remote --exit-code origin $SOURCE_BRANCH; then
35
+ git fetch --unshallow
36
+ git checkout $SOURCE_BRANCH
37
+ git pull
38
+ git merge --no-ff $TARGET_BRANCH -m "chore: Auto-merge $TARGET_BRANCH into $SOURCE_BRANCH"
39
+ else
40
+ git checkout -b $SOURCE_BRANCH $TARGET_BRANCH
41
+ fi
42
+ git push -u origin $SOURCE_BRANCH
43
+
44
+ mcore:
45
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_bump_yamlfile.yml@v0.27.1
46
+ needs: [update-branch]
47
+ with:
48
+ source-repository: NVIDIA/Megatron-LM
49
+ source-ref: ${{ inputs.mcore-target-branch }}
50
+ yaml-path: '."vcs-dependencies"."megatron-lm".ref'
51
+ file: requirements/manifest.json
52
+ base-branch: weekly-bump-${{ inputs.nemo-target-branch }}
53
+ cicd-labels: Run CICD,no-fail-fast
54
+ pr-reviewers: ${{ inputs.pr-reviewers }}
55
+ secrets:
56
+ PAT: ${{ secrets.PAT }}
.github/workflows/build-test-publish-wheel.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: Build, test, and publish a PyPi wheel (to testpypi).
16
+
17
+ on:
18
+ push:
19
+ branches:
20
+ - main
21
+ - "r**"
22
+
23
+ defaults:
24
+ run:
25
+ shell: bash -x -e -u -o pipefail {0}
26
+
27
+ jobs:
28
+ build-test-publish-wheel:
29
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_test_publish_wheel.yml@v0.22.3
30
+ with:
31
+ dry-run: true
32
+ python-package: nemo
33
+ python-version: "3.10"
34
+ secrets:
35
+ TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
36
+ TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
37
+ SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
38
+ SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
.github/workflows/changelog-build.yml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 'Changelog Build (Release)'
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ last-release-tag:
7
+ description: Last Git tag to start from (exclusive) (e.g. `v2.0.0`)
8
+ type: string
9
+ required: true
10
+ release-branch:
11
+ description: Release branch to build changelog on (e.g. `r2.1.0`)
12
+ type: string
13
+ required: true
14
+
15
+ jobs:
16
+ changelog:
17
+ runs-on: ubuntu-latest
18
+ steps:
19
+ - name: Checkout branch
20
+ uses: actions/checkout@v4
21
+ with:
22
+ ref: main
23
+ fetch-depth: 0
24
+
25
+ - name: Build Changelog
26
+ id: github_tag
27
+ uses: mikepenz/release-changelog-builder-action@v3.3.1
28
+ env:
29
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30
+ with:
31
+ # Configuration file is setup with filters for domains
32
+ # owner:repo must point to current repo
33
+ # fromTag: Auto resolved from historical tag order (previous tag compared to current tag)
34
+ # toTag: Current tag reference
35
+ configuration: ".github/workflows/config/changelog-config.json"
36
+ owner: "NVIDIA"
37
+ repo: "NeMo"
38
+ ignorePreReleases: "false"
39
+ failOnError: "false"
40
+ fromTag: ${{ inputs.last-release-tag }}
41
+ toTag: ${{ inputs.release-branch }}
42
+
43
+ - name: Update changelog file
44
+ env:
45
+ RELEASE_BRANCH: ${{ inputs.release-branch }}
46
+ CHANGELOG: ${{ steps.github_tag.outputs.changelog }}
47
+ shell: bash -x -e -u -o pipefail {0}
48
+ run: |
49
+ RELEASE_VERSION=${RELEASE_BRANCH#r}
50
+ CHANGELOG=$(echo "$CHANGELOG" | sed '/^[[:blank:]]*#/s/#/###/')
51
+
52
+ RELEASE_NOTES="## NVIDIA Neural Modules $RELEASE_VERSION
53
+
54
+ ### Detailed Changelogs:
55
+
56
+ $CHANGELOG"
57
+
58
+ printf "%s\n" "$RELEASE_NOTES" | sed '/<!-- Next changelog -->/r /dev/stdin' CHANGELOG.md > CHANGELOG.tmp.md
59
+
60
+ mv CHANGELOG.tmp.md CHANGELOG.md
61
+
62
+ - name: Inspect new changelog file
63
+ run: cat CHANGELOG.md
64
+
65
+ - name: Create Pull Request
66
+ uses: peter-evans/create-pull-request@v7
67
+ with:
68
+ commit-message: "beep boop: Update changelog"
69
+ title: "Update changelog for `${{ inputs.release-branch }}`"
70
+ signoff: true
71
+ sign-commits: true
72
+ base: main
73
+ branch: bot/chore/update-changelog-into-${{ inputs.release-branch }}
.github/workflows/cherry-pick-release-commit.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Create PR to main with cherry-pick from release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ cherry-pick:
10
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cherry_pick.yml@v0.22.7
11
+ secrets:
12
+ PAT: ${{ secrets.PAT }}
13
+ SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
14
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
.github/workflows/cicd-approve-test-queue.yml ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: Approve Test Queue
16
+
17
+ on:
18
+ schedule:
19
+ - cron: '*/5 * * * *' # Runs every 5 minutes
20
+ workflow_dispatch: # Allows manual triggering
21
+
22
+ jobs:
23
+ approve-queue:
24
+ runs-on: ubuntu-latest
25
+ environment: main
26
+ steps:
27
+ - name: Checkout repository
28
+ uses: actions/checkout@v4
29
+
30
+ - name: Set up Python
31
+ uses: actions/setup-python@v5
32
+ with:
33
+ python-version: "3.12"
34
+
35
+ - name: Install dependencies
36
+ run: |
37
+ python -m pip install --upgrade pip
38
+ pip install requests
39
+
40
+ - name: Approve waiting deployments
41
+ env:
42
+ GITHUB_TOKEN: ${{ secrets.PAT }}
43
+ MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
44
+ run: |
45
+ python - <<EOF
46
+ import os
47
+ import requests
48
+
49
+
50
+ # GitHub API configuration
51
+ GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
52
+ REPO = os.environ["GITHUB_REPOSITORY"]
53
+ MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
54
+ API_BASE = f"https://api.github.com/repos/{REPO}"
55
+
56
+ # Headers for GitHub API
57
+ headers = {
58
+ "Authorization": f"token {GITHUB_TOKEN}",
59
+ "Accept": "application/vnd.github.v3+json",
60
+ "X-GitHub-Api-Version": "2022-11-28",
61
+ }
62
+
63
+ def make_request(endpoint, method="GET", data=None):
64
+ """Make a request to the GitHub API with error handling."""
65
+ url = f"{API_BASE}/{endpoint}"
66
+ try:
67
+ if method == "GET":
68
+ response = requests.get(url, headers=headers)
69
+ else:
70
+ response = requests.post(url, headers=headers, json=data)
71
+ response.raise_for_status()
72
+ response_json = response.json()
73
+ if hasattr(response, "links") and "actions/runs?status" in endpoint:
74
+ response_json["next"] = response.links.get("next", {}).get("url")
75
+
76
+ return response_json
77
+ except requests.exceptions.RequestException as e:
78
+ print(f"Error making request to {endpoint}: {str(e)}")
79
+ if hasattr(e.response, 'text'):
80
+ print(f"Response: {e.response.text}")
81
+ return None
82
+
83
+
84
+ def get_workflow_runs(status):
85
+ """Get all workflow runs for a given status."""
86
+ all_results = []
87
+ endpoint = f"actions/runs?status={status}"
88
+ while endpoint:
89
+ response = make_request(endpoint)
90
+ if not response:
91
+ break
92
+
93
+ all_results.extend(response.get("workflow_runs", []))
94
+ endpoint = None
95
+ next_url = response.get("next")
96
+ if next_url:
97
+ endpoint = f"actions/runs?{next_url.split('?')[1]}"
98
+
99
+ return all_results
100
+
101
+
102
+ # Get current running and queued workflows
103
+ print("Fetching workflow runs...")
104
+ queued_workflow_runs = get_workflow_runs("queued")
105
+ in_progress_workflow_runs = get_workflow_runs("in_progress")
106
+
107
+ # Count running and queued workflows
108
+ queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
109
+ in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")
110
+
111
+ total_workflows = queued_workflows + in_progress_workflows
112
+ print(f"Current queued workflows: {queued_workflows}")
113
+ print(f"Current running workflows: {in_progress_workflows}")
114
+ print(f"Total workflows: {total_workflows}")
115
+ print(f"Max concurrency: {MAX_CONCURRENCY}")
116
+
117
+ if total_workflows >= MAX_CONCURRENCY:
118
+ print("Maximum concurrency reached, no new approvals will be made")
119
+ exit(0)
120
+
121
+ # Get waiting CI workflows for test environment
122
+ print("Fetching deployments...")
123
+ pending_workflows = get_workflow_runs("waiting")
124
+ pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]
125
+
126
+ # Sort deployments by creation date (oldest first)
127
+ print("Sorting workflows...")
128
+ pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])
129
+
130
+ # Process each deployment
131
+ print("Processing ...")
132
+ for workflow in pending_workflows:
133
+ if total_workflows >= MAX_CONCURRENCY:
134
+ print("Maximum concurrency reached, stopping approvals")
135
+ break
136
+
137
+ workflow_id = workflow["id"]
138
+ workflow_name = workflow["display_title"]
139
+ print(f"Approving workflow {workflow_name} with Run Id: {workflow_id}")
140
+
141
+ deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
142
+ deployment = make_request(deployment_url)[0]
143
+ environment_id = deployment["environment"]["id"]
144
+
145
+ # Approve the deployment
146
+ status_data = {
147
+ "environment_ids": [environment_id],
148
+ "state": "approved",
149
+ "comment": "Automatically approved by queue manager"
150
+ }
151
+ result = make_request(deployment_url, method="POST", data=status_data)
152
+
153
+ if result:
154
+ total_workflows += 1
155
+ else:
156
+ print(f"Failed to approve deployment {deployment['id']}")
157
+ exit(1)
158
+
159
+ EOF
160
+ notify:
161
+ if: failure()
162
+ runs-on: ubuntu-latest
163
+ needs: [approve-queue]
164
+ steps:
165
+ - name: Notify
166
+ env:
167
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
168
+ SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
169
+ GITHUB_RUN_ID: ${{ github.run_id }}
170
+ GITHUB_REPOSITORY: ${{ github.repository }}
171
+ run: |
172
+ curl -X POST \
173
+ -H 'Content-type: application/json' \
174
+ --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Test-queue-approval-bot workflow> failed. Please review manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
175
+ $SLACK_WEBHOOK
.github/workflows/cicd-main-automodel.yml ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo E2E Automodel Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+ image-name:
22
+ required: false
23
+ default: nemo_container_automodel
24
+ type: string
25
+
26
+ jobs:
27
+ build:
28
+ uses: ./.github/workflows/_build_container.yml
29
+ with:
30
+ image-name: ${{ inputs.image-name }}
31
+ dockerfile: docker/Dockerfile.ci
32
+
33
+ unit-tests:
34
+ strategy:
35
+ fail-fast: false
36
+ matrix:
37
+ include:
38
+ - script: L0_Unit_Tests_GPU_Automodel
39
+ runner: self-hosted-azure-gpus-1
40
+ - script: L0_Unit_Tests_CPU_Automodel
41
+ runner: self-hosted-azure-cpu
42
+ cpu-only: true
43
+ needs: [build]
44
+ runs-on: ${{ matrix.runner }}
45
+ name: ${{ matrix.script }}
46
+ steps:
47
+ - name: Checkout
48
+ uses: actions/checkout@v4
49
+ with:
50
+ path: ${{ github.run_id }}
51
+ - name: main
52
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
53
+ with:
54
+ runner: ${{ runner.name }}
55
+ script: ${{ matrix.script }}
56
+ is_unit_test: true
57
+ tests_to_run: ${{ inputs.test_to_run }}
58
+ image: ${{ inputs.image-name }}
59
+ cpu-only: ${{ matrix.cpu-only || false }}
60
+ is_optional: ${{ matrix.is-optional || false }}
61
+
62
+ e2e-tests:
63
+ strategy:
64
+ fail-fast: false
65
+ matrix:
66
+ include:
67
+ - runner: self-hosted-azure-gpus-1
68
+ script: L2_VLM_HF_Transformer_PEFT
69
+ - runner: self-hosted-azure
70
+ script: L2_VLM_HF_Transformer_PEFT_FSDP2
71
+ - runner: self-hosted-azure-gpus-1
72
+ script: L2_VLM_HF_Transformer_PEFT_4bit
73
+ is-optional: true
74
+ - runner: self-hosted-azure
75
+ script: L2_VLM_HF_Transformer_SFT_FSDP2
76
+ - runner: self-hosted-azure
77
+ script: L2_HF_Transformer_PEFT_notebook
78
+ - runner: self-hosted-azure-gpus-1
79
+ script: L2_HF_Transformer_PEFT
80
+ - runner: self-hosted-azure-gpus-1
81
+ script: L2_HF_Transformer_PEFT_nemorun
82
+ - runner: self-hosted-azure
83
+ script: L2_HF_Transformer_PEFT_2gpu
84
+ - runner: self-hosted-azure
85
+ script: L2_HF_Transformer_PEFT_2gpu_FSDP2_liger
86
+ - runner: azure-gpu-vm-runner1-h100
87
+ script: L2_HF_Transformer_PEFT_2gpu_FSDP2_fp8
88
+ - runner: self-hosted-azure
89
+ script: L2_HF_Transformer_PEFT_2gpu_FSDP2
90
+ - runner: self-hosted-azure
91
+ script: L2_HF_Transformer_PEFT_2gpu_nemorun
92
+ - runner: self-hosted-azure
93
+ script: L2_HF_Transformer_SFT_2gpu
94
+ - runner: self-hosted-azure
95
+ script: L2_HF_Transformer_SFT_2gpu_FSDP2
96
+ - runner: azure-gpu-vm-runner1-h100
97
+ script: L2_HF_Transformer_SFT_2gpu_FSDP2_fp8
98
+ - runner: self-hosted-azure
99
+ script: L2_HF_Transformer_SFT_2gpu_nemorun
100
+ - runner: self-hosted-azure
101
+ script: L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
102
+ - runner: self-hosted-azure
103
+ script: L2_HF_Transformer_SFT_FSDP2_2gpu
104
+ - runner: self-hosted-azure
105
+ script: L2_HF_Transformer_PT_2gpu
106
+ - runner: self-hosted-azure
107
+ script: L2_HF_Transformer_PT_2gpu_nemorun
108
+ - runner: self-hosted-azure-gpus-1
109
+ script: L2_HF_Transformer_PT
110
+ - runner: self-hosted-azure-gpus-1
111
+ script: L2_HF_Transformer_PT_nemorun
112
+ - runner: self-hosted-azure
113
+ script: L2_HF_Transformer_SFT_notebook
114
+ - runner: self-hosted-azure-gpus-1
115
+ script: L2_HF_Transformer_SFT
116
+ - runner: self-hosted-azure-gpus-1
117
+ script: L2_HF_Transformer_SFT_nemorun
118
+ - runner: self-hosted-azure-gpus-1
119
+ script: L2_HF_Transformer_SFT_TE_Acceleration
120
+ - runner: self-hosted-azure-gpus-1
121
+ script: L2_HF_Transformer_PT_TE_Acceleration
122
+ needs: [unit-tests]
123
+ runs-on: ${{ matrix.runner }}
124
+ name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
125
+ steps:
126
+ - name: Checkout
127
+ uses: actions/checkout@v4
128
+ with:
129
+ path: ${{ github.run_id }}
130
+ - name: main
131
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
132
+ with:
133
+ runner: ${{ runner.name }}
134
+ script: ${{ matrix.script }}
135
+ tests_to_run: ${{ inputs.test_to_run }}
136
+ image: ${{ inputs.image-name }}
137
+ is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main-export-deploy.yml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo E2E Export Deploy Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+ image-name:
22
+ required: false
23
+ default: nemo_container_export_deploy
24
+ type: string
25
+
26
+ jobs:
27
+ build:
28
+ uses: ./.github/workflows/_build_container.yml
29
+ with:
30
+ image-name: ${{ inputs.image-name }}
31
+ dockerfile: docker/Dockerfile.ci.export_deploy
32
+
33
+ unit-tests:
34
+ strategy:
35
+ fail-fast: false
36
+ matrix:
37
+ include:
38
+ - script: L0_Unit_Tests_GPU_Export_Deploy
39
+ runner: self-hosted-azure
40
+ is-optional: true
41
+ - script: L0_Unit_Tests_CPU_Export_Deploy
42
+ runner: self-hosted-azure-cpu
43
+ cpu-only: true
44
+ - script: L0_Unit_Tests_Eval_Legacy
45
+ runner: self-hosted-azure
46
+ - script: L0_Unit_Tests_Eval
47
+ runner: self-hosted-azure
48
+ - script: L0_Unit_Tests_Eval_Adapters
49
+ runner: self-hosted-azure
50
+ needs: [build]
51
+ runs-on: ${{ matrix.runner }}
52
+ name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
53
+ steps:
54
+ - name: Checkout
55
+ uses: actions/checkout@v4
56
+ with:
57
+ path: ${{ github.run_id }}
58
+ - name: main
59
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
60
+ with:
61
+ runner: ${{ runner.name }}
62
+ script: ${{ matrix.script }}
63
+ is_unit_test: true
64
+ tests_to_run: ${{ inputs.test_to_run }}
65
+ image: ${{ inputs.image-name }}
66
+ cpu-only: ${{ matrix.cpu-only || false }}
67
+ is_optional: ${{ matrix.is-optional || false }}
68
+ e2e-tests:
69
+ strategy:
70
+ fail-fast: false
71
+ matrix:
72
+ include:
73
+ # Export tests
74
+ - script: L2_NeMo_2_Export_HF_TRT_LLM
75
+ runner: self-hosted-azure
76
+ - script: L2_NeMo_2_Export_Deploy_Query_In_Framework
77
+ runner: self-hosted-azure
78
+ is-optional: true
79
+ - script: L2_ONNX_TRT_LLM_Embedding_Export
80
+ runner: self-hosted-azure
81
+ - script: L2_NeMo_2_Export_TRT_LLM
82
+ runner: self-hosted-azure
83
+ - script: L2_NeMo_2_vLLM_Export_Llama
84
+ runner: self-hosted-azure
85
+ - script: L2_NeMo_2_vLLM_Export_Mixtral
86
+ runner: self-hosted-azure
87
+ - script: L2_NeMo_2_Export_In_Framework
88
+ runner: self-hosted-azure
89
+ - script: L2_NeMo_2_Export_Qnemo_TRT_LLM
90
+ runner: self-hosted-azure
91
+ - script: L2_NeMo_2_VLLM_VISION
92
+ runner: self-hosted-azure
93
+ - script: L2_NeMo_2_EVAL_Legacy
94
+ runner: self-hosted-azure-gpus-1
95
+ - script: L2_NeMo_2_EVAL_gsm8k
96
+ runner: self-hosted-azure-gpus-1
97
+ - script: L2_NeMo_2_EVAL_arcc
98
+ runner: self-hosted-azure-gpus-1
99
+ needs: [unit-tests]
100
+ runs-on: ${{ matrix.runner }}
101
+ name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
102
+ steps:
103
+ - name: Checkout
104
+ uses: actions/checkout@v4
105
+ with:
106
+ path: ${{ github.run_id }}
107
+ - name: main
108
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
109
+ with:
110
+ runner: ${{ runner.name }}
111
+ script: ${{ matrix.script }}
112
+ tests_to_run: ${{ inputs.test_to_run }}
113
+ image: ${{ inputs.image-name }}
114
+ is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main-nemo2.yml ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo E2E NeMo2 Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+ image-name:
22
+ required: false
23
+ default: nemo_container_nemo2
24
+ type: string
25
+
26
+ jobs:
27
+ build:
28
+ uses: ./.github/workflows/_build_container.yml
29
+ with:
30
+ image-name: ${{ inputs.image-name }}
31
+ dockerfile: docker/Dockerfile.ci
32
+
33
+ e2e-tests:
34
+ strategy:
35
+ fail-fast: false
36
+ matrix:
37
+ include:
38
+ - script: L2_NeMo_2_GPT_Pretraining_no_transformer_engine
39
+ runner: self-hosted-azure
40
+ - script: L2_NeMo_2_llama3_pretraining_recipe
41
+ runner: self-hosted-azure
42
+ - script: L2_NeMo_2_llama3_pytorch_profiler
43
+ runner: self-hosted-azure
44
+ - script: L2_NeMo_2_llama3_fault_tolerance_plugin
45
+ runner: self-hosted-azure
46
+ - script: L2_NeMo_2_llama3_straggler_detection
47
+ runner: self-hosted-azure
48
+ - script: L2_NeMo_2_llama3_local_ckpt
49
+ runner: self-hosted-azure
50
+ - script: L2_NeMo_2_GPT_DDP_Param_Parity_check
51
+ runner: self-hosted-azure
52
+ - script: L2_NeMo_2_Hyena_Conversion_from_HF
53
+ runner: self-hosted-azure
54
+ - script: L2_NeMo_2_Hyena_DDP_Pretraining_Test
55
+ runner: self-hosted-azure
56
+ - script: L2_NeMo_2_Hyena_Mixer_Test
57
+ runner: self-hosted-azure-gpus-2-h100
58
+ - script: L2_NeMo_2_Hyena_PP_Pretraining_Test
59
+ runner: self-hosted-azure
60
+ - script: L2_NeMo_2_Hyena_TP_Pretraining_Test
61
+ runner: self-hosted-azure
62
+ - script: L2_NeMo_2_Hyena_CP_Pretraining_Test
63
+ runner: self-hosted-azure
64
+ - script: L2_NeMo_2_SSM_Pretraining
65
+ runner: self-hosted-azure
66
+ - script: L2_NeMo_2_SSM_Finetuning
67
+ runner: self-hosted-azure-gpus-2-h100
68
+ - script: L2_NeMo_2_HF_MODEL_IMPORT
69
+ runner: self-hosted-azure
70
+ - script: L2_NeMo_2_jit_callback
71
+ runner: self-hosted-azure
72
+ - script: L2_NeMo_2_T5_Pretraining
73
+ runner: self-hosted-azure
74
+ - script: L2_NeMo_2_T5_MockData_Pretraining
75
+ runner: self-hosted-azure
76
+ - script: L2_NeMo_2_T5_Finetuning
77
+ runner: self-hosted-azure
78
+ - script: L2_NeMo_2_T5_Squad
79
+ runner: self-hosted-azure
80
+ - script: L2_NeMo_2_T5_LoRA
81
+ runner: self-hosted-azure
82
+ - script: L2_NeMo_2_BERT_Pretraining_Megatron
83
+ runner: self-hosted-azure
84
+ - script: L2_NeMo_2_BERT_Pretraining_HuggingFace
85
+ runner: self-hosted-azure
86
+ - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_TP2
87
+ runner: self-hosted-azure-gpus-2-h100
88
+ - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_PP2
89
+ runner: self-hosted-azure-gpus-2-h100
90
+ - script: L2_NeMo_2_NEVA_MOCK_PRETRAIN_CP2
91
+ runner: self-hosted-azure-gpus-2-h100
92
+ - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_TP2
93
+ runner: self-hosted-azure-gpus-2-h100
94
+ - script: L2_NeMo_2_NEVA_ENERGON_FINETUNE_TP2
95
+ runner: self-hosted-azure-gpus-2-h100
96
+ - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_PP2
97
+ runner: self-hosted-azure-gpus-2-h100
98
+ - script: L2_NeMo_2_NEVA_MOCK_FINETUNE_CP2
99
+ runner: self-hosted-azure-gpus-2-h100
100
+ - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_PAD
101
+ runner: self-hosted-azure-gpus-2-h100
102
+ - script: L2_NeMo_2_NEVA_PRELOADED_FINETUNE_PP2_SEQPACK_TRUNC
103
+ runner: self-hosted-azure-gpus-2-h100
104
+ - script: L2_NeMo_2_NEVA_LOAD_GENERATE
105
+ runner: self-hosted-azure-gpus-1
106
+ - script: L2_NeMo_2_LLAVA_IMPORT
107
+ runner: self-hosted-azure-gpus-1
108
+ - script: L2_NEMO_2_MLLAMA_Inference
109
+ runner: self-hosted-azure-gpus-1
110
+ - script: L2_NeMo_2_MLLAMA_MOCK_FINETUNE_TP2
111
+ runner: self-hosted-azure
112
+ - script: L2_NeMo_2_MLLAMA_PRELOADED_FINETUNE_TP2
113
+ runner: self-hosted-azure
114
+ - script: L2_NeMo_2_MLLAMA_ENERGON_FINETUNE_TP2
115
+ runner: self-hosted-azure
116
+ - script: L2_NeMo_2_MLLAMA_IMPORT
117
+ runner: self-hosted-azure-gpus-1
118
+ - script: L2_NeMo_2_Mixtral_Pretraining
119
+ runner: self-hosted-azure
120
+ - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1
121
+ runner: self-hosted-azure
122
+ - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS2
123
+ runner: self-hosted-azure
124
+ - script: L2_NeMo_2_GPT_SFT_TP1PP2_MBS2
125
+ runner: self-hosted-azure
126
+ - script: L2_NeMo_2_GPT_SFT_TP2PP1_MBS2
127
+ runner: self-hosted-azure
128
+ - script: L2_NeMo_2_GPT_SFT_TP1PP1_MBS1_PACKED
129
+ runner: self-hosted-azure
130
+ - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1
131
+ runner: self-hosted-azure
132
+ - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS2
133
+ runner: self-hosted-azure
134
+ - script: L2_NeMo_2_GPT_LoRA_TP1PP2_MBS2
135
+ runner: self-hosted-azure
136
+ - script: L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
137
+ runner: self-hosted-azure
138
+ - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
139
+ runner: self-hosted-azure
140
+ - script: L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
141
+ runner: self-hosted-azure
142
+ - script: L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED
143
+ runner: self-hosted-azure
144
+ - script: L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_Chat
145
+ runner: self-hosted-azure
146
+ - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2_exclude
147
+ runner: self-hosted-azure
148
+ - script: L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
149
+ runner: self-hosted-azure
150
+ - script: L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
151
+ runner: self-hosted-azure
152
+ - script: L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
153
+ runner: self-hosted-azure
154
+ - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
155
+ runner: self-hosted-azure
156
+ - script: L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1_exclude
157
+ runner: self-hosted-azure
158
+ - script: L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
159
+ runner: self-hosted-azure
160
+ - script: L2_NEMO_2_LoRA_MERGE
161
+ runner: self-hosted-azure
162
+ - script: L2_NEMO_2_LoRA_Export
163
+ runner: self-hosted-azure-gpus-1
164
+ - script: L2_NEMO_2_LoRA_Inference
165
+ runner: self-hosted-azure-gpus-1
166
+ - script: L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
167
+ runner: self-hosted-azure
168
+ - script: L2_NeMo_2_Automodel_PTQ_trtllm
169
+ runner: self-hosted-azure
170
+ - script: L2_NeMo_2_Automodel_PTQ_hf
171
+ runner: self-hosted-azure
172
+ - script: L2_NeMo_2_PTQ_Llama2_FP8_trtllm
173
+ runner: self-hosted-azure
174
+ - script: L2_NeMo_2_PTQ_Llama2_FP8_nemo
175
+ runner: self-hosted-azure
176
+ - script: L2_NeMo_2_PTQ_Unified_Export
177
+ runner: self-hosted-azure
178
+ - script: L2_NeMo_2_Distill_Llama3_TP1PP2
179
+ runner: self-hosted-azure
180
+ - script: L2_NeMo_2_Prune_Llama_TP1PP2
181
+ runner: self-hosted-azure
182
+ - script: L2_NeMo_2_GPT_Speculative_Llama3_TP2PP1
183
+ runner: self-hosted-azure
184
+ - script: L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
185
+ runner: self-hosted-azure
186
+ - script: L2_NeMo_2_LLAVA_NEXT_HF_CONVERSION
187
+ runner: self-hosted-azure
188
+ - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_TRAIN
189
+ runner: self-hosted-azure
190
+ - script: L2_NeMo_2_LLAVA_NEXT_ENERGON_PACKED_TRAIN
191
+ runner: self-hosted-azure
192
+ - script: L2_NeMo_2_AVLM_MOCK_TRAINING
193
+ runner: self-hosted-azure
194
+ - script: L2_NeMo_2_AVLM_ENERGON_TRAIN
195
+ runner: self-hosted-azure
196
+ - script: L2_NeMo_2_AVLM_ENERGON_CP2_TRAIN
197
+ runner: self-hosted-azure
198
+ - script: L2_NeMo_2_CLIP_PRETRAIN
199
+ runner: self-hosted-azure
200
+ timeout: 20
201
+ - script: L2_NeMo_2_CLIP_INFER
202
+ runner: self-hosted-azure
203
+ - script: L2_NeMo_2_Auto_Configurator_llama_TP1_PP1_MBS124
204
+ runner: self-hosted-azure-gpus-1
205
+ - script: L2_NeMo_2_Auto_Configurator_bert_TP1_PP1_MBS124
206
+ runner: self-hosted-azure-gpus-1
207
+ - script: L2_NeMo_2_Auto_Configurator_t5_TP1_PP1_MBS124
208
+ runner: self-hosted-azure-gpus-1
209
+ - script: L2_NeMo_2_Auto_Configurator_callbacks
210
+ runner: self-hosted-azure-gpus-1
211
+ - script: L2_NeMo_2_Conversion_Test_Baichuan2
212
+ runner: self-hosted-azure
213
+ - script: L2_NeMo_2_Conversion_Test_ChatGLM
214
+ runner: self-hosted-azure
215
+ - script: L2_NeMo_2_Conversion_Test_DeepSeek
216
+ runner: self-hosted-azure
217
+ - script: L2_NeMo_2_Conversion_Test_Gemma
218
+ runner: self-hosted-azure
219
+ - script: L2_NeMo_2_Conversion_Test_Gemma2
220
+ runner: self-hosted-azure
221
+ - script: L2_NeMo_2_Conversion_Test_Gemma3_llm
222
+ runner: self-hosted-azure
223
+ - script: L2_NeMo_2_Conversion_Test_Gemma3_vlm
224
+ runner: self-hosted-azure
225
+ - script: L2_NeMo_2_Conversion_Test_Mistral
226
+ runner: self-hosted-azure
227
+ - script: L2_NeMo_2_Conversion_Test_Llama
228
+ runner: self-hosted-azure
229
+ - script: L2_NeMo_2_Conversion_Test_Llama_Embedding
230
+ runner: self-hosted-azure
231
+ - script: L2_NeMo_2_Conversion_Test_Llama4
232
+ runner: self-hosted-azure
233
+ - script: L2_NeMo_2_Conversion_Test_Llama4_Text
234
+ runner: self-hosted-azure
235
+ - script: L2_NeMo_2_PTQ_Llama4_FP8_nemo
236
+ runner: self-hosted-azure
237
+ - script: L2_NeMo_2_Conversion_Test_Nemotron
238
+ runner: self-hosted-azure
239
+ - script: L2_NeMo_2_Conversion_Test_Nemotron_H_4B
240
+ runner: self-hosted-azure
241
+ - script: L2_NeMo_2_Conversion_Test_Phi3Mini
242
+ runner: self-hosted-azure
243
+ - script: L2_NeMo_2_Conversion_Test_Qwen2
244
+ runner: self-hosted-azure
245
+ - script: L2_NeMo_2_Conversion_Test_Qwen3
246
+ runner: self-hosted-azure
247
+ - script: L2_NeMo_2_Conversion_Test_Starcoder
248
+ runner: self-hosted-azure
249
+ - script: L2_NeMo_2_Conversion_Test_Starcoder2
250
+ runner: self-hosted-azure
251
+ - script: L2_NeMo_2_Conversion_Test_BERT
252
+ runner: self-hosted-azure
253
+ - script: L2_NeMo_2_Conversion_Test_T5
254
+ runner: self-hosted-azure
255
+ - runner: self-hosted-azure
256
+ script: L2_NeMo_2_QWEN2VL_MOCK_FINETUNE_TP2
257
+ - runner: self-hosted-azure
258
+ script: L2_NeMo_2_QWEN2VL_PRELOADED_FINETUNE_TP2
259
+ - runner: self-hosted-azure
260
+ script: L2_NeMo_2_QWEN2VL_ENERGON_FINETUNE_TP2
261
+ - runner: self-hosted-azure
262
+ script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_PP2
263
+ - runner: self-hosted-azure
264
+ script: L2_NeMo_2_LLAMA4_MOCK_FINETUNE_CP2
265
+ - runner: self-hosted-azure
266
+ script: L2_NeMo_2_LLAMA4_ENERGON_FINETUNE_EP2
267
+ - runner: self-hosted-azure
268
+ script: L2_NeMo_2_Diffusion_Recipe_Test
269
+ - runner: self-hosted-azure
270
+ script: L2_NeMo_2_Diffusion_Taskencoder_Test
271
+ - runner: self-hosted-azure
272
+ script: L2_NeMo_2_Flux_Import_Test
273
+ is-optional: true
274
+ - runner: self-hosted-azure
275
+ script: L2_NeMo_2_Flux_Inference_Test
276
+ - runner: self-hosted-azure
277
+ script: L2_NeMo_2_Flux_Training_DDP_Test
278
+ - runner: self-hosted-azure
279
+ script: L2_NeMo_2_Flux_Training_FSDP_Test
280
+ - runner: self-hosted-azure
281
+ script: L2_NeMo_2_Flux_ControlNet_Training_DDP_Test
282
+ - runner: self-hosted-azure
283
+ script: L2_NeMo_2_Flux_ControlNet_Training_FSDP_Test
284
+
285
+
286
+ needs: [build]
287
+ runs-on: ${{ matrix.runner }}
288
+ name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
289
+ steps:
290
+ - name: Checkout
291
+ uses: actions/checkout@v4
292
+ with:
293
+ path: ${{ github.run_id }}
294
+ - name: main
295
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
296
+ with:
297
+ runner: ${{ runner.name }}
298
+ script: ${{ matrix.script }}
299
+ tests_to_run: ${{ inputs.test_to_run }}
300
+ image: ${{ inputs.image-name }}
301
+ is_optional: ${{ matrix.is-optional || false }}
302
+ timeout: ${{ matrix.timeout || 10 }}
.github/workflows/cicd-main-speech.yml ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo E2E Speech Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+ image-name:
22
+ required: false
23
+ default: nemo_container_speech
24
+ type: string
25
+
26
+ jobs:
27
+ build:
28
+ uses: ./.github/workflows/_build_container.yml
29
+ with:
30
+ image-name: ${{ inputs.image-name }}
31
+ dockerfile: docker/Dockerfile.ci
32
+
33
+ unit-tests:
34
+ strategy:
35
+ fail-fast: false
36
+ matrix:
37
+ include:
38
+ - script: L0_Unit_Tests_GPU_ASR
39
+ runner: self-hosted-azure-gpus-1
40
+ timeout: 20
41
+ - script: L0_Unit_Tests_CPU_ASR
42
+ runner: self-hosted-azure-cpu
43
+ cpu-only: true
44
+ timeout: 20
45
+ - script: L0_Unit_Tests_GPU_TTS
46
+ runner: self-hosted-azure-gpus-1
47
+ - script: L0_Unit_Tests_CPU_TTS
48
+ runner: self-hosted-azure-cpu
49
+ cpu-only: true
50
+ - script: L0_Unit_Tests_GPU_Audio
51
+ runner: self-hosted-azure-gpus-1
52
+ - script: L0_Unit_Tests_CPU_Audio
53
+ runner: self-hosted-azure-cpu
54
+ cpu-only: true
55
+ - script: L0_Unit_Tests_GPU_SpeechLM2
56
+ runner: self-hosted-azure-gpus-1
57
+ timeout: 20
58
+ - script: L0_Unit_Tests_CPU_SpeechLM2
59
+ runner: self-hosted-azure-cpu
60
+ cpu-only: true
61
+ timeout: 20
62
+ needs: [build]
63
+ runs-on: ${{ matrix.runner }}
64
+ name: ${{ matrix.script }}
65
+ steps:
66
+ - name: Checkout
67
+ uses: actions/checkout@v4
68
+ with:
69
+ path: ${{ github.run_id }}
70
+ - name: main
71
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
72
+ with:
73
+ runner: ${{ runner.name }}
74
+ script: ${{ matrix.script }}
75
+ is_unit_test: true
76
+ tests_to_run: ${{ inputs.test_to_run }}
77
+ image: ${{ inputs.image-name }}
78
+ timeout: ${{ matrix.timeout || 10 }}
79
+ cpu-only: ${{ matrix.cpu-only || false }}
80
+ is_optional: ${{ matrix.is-optional || false }}
81
+
82
+ e2e-tests:
83
+ strategy:
84
+ fail-fast: false
85
+ matrix:
86
+ include:
87
+ - runner: self-hosted-azure-gpus-1
88
+ script: ASR_dev_run_Speech_to_Text
89
+ - runner: self-hosted-azure-gpus-1
90
+ script: ASR_dev_run_Speech_to_Text_WPE_CitriNet
91
+ - runner: self-hosted-azure-gpus-1
92
+ script: ASR_dev_run_Speech_Pre-training_-_CitriNet
93
+ - runner: self-hosted-azure-gpus-1
94
+ script: Optional_ASR_dev_run_Speech_To_Text_Finetuning
95
+ is-optional: true
96
+ - runner: self-hosted-azure-gpus-1
97
+ script: Optional_ASR_dev_run_Speech_To_Text_HF_Finetuning
98
+ is-optional: true
99
+ - runner: self-hosted-azure-gpus-1
100
+ script: ASR_dev_run_Speech_to_Text_WPE_-_Conformer
101
+ - runner: self-hosted-azure-gpus-1
102
+ script: ASR_dev_run-part_two_Speech_to_Text_WPE_-_Squeezeformer
103
+ - runner: self-hosted-azure-gpus-1
104
+ script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Text_multi-dataloader
105
+ - runner: self-hosted-azure-gpus-1
106
+ script: L2_ASR_Multi-dataloader_dev_run_Speech_to_Label_multi-dataloader
107
+ - runner: self-hosted-azure-gpus-1
108
+ script: L2_ASR_Adapters_Linear_Adapters
109
+ - runner: self-hosted-azure-gpus-1
110
+ script: L2_ASR_Adapters_RelPos_MHA_Adapters
111
+ - runner: self-hosted-azure
112
+ script: L2_Speech_to_Text_EMA
113
+ - runner: self-hosted-azure-gpus-1
114
+ script: L2_Speech_to_Text_AED
115
+ - runner: self-hosted-azure-gpus-1
116
+ script: L2_Speaker_dev_run_Speech_to_Label
117
+ - runner: self-hosted-azure
118
+ script: L2_Speech_Estimate_Duration_Bins
119
+ - runner: self-hosted-azure
120
+ script: L2_Speech_Batch_Size_OOMptimizer
121
+ - runner: self-hosted-azure
122
+ script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
123
+ is-optional: true
124
+ - runner: self-hosted-azure
125
+ script: L2_Speech_Transcription_Speech_to_Text_Transcribe
126
+ - runner: self-hosted-azure
127
+ script: L2_Speech_Transcription_Canary_Transcribe_Full_Manifest
128
+ - runner: self-hosted-azure
129
+ script: L2_Speech_Transcription_Canary_Transcribe_With_Prompt
130
+ - runner: self-hosted-azure
131
+ script: L2_Speech_Transcription_Canary_Transcribe_Audio_Dir
132
+ - runner: self-hosted-azure
133
+ script: L2_Longform_Speech_Transcription_Canary_Chunked_Infer_from_Audio_Dir
134
+ - runner: self-hosted-azure
135
+ script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Audio_Dir
136
+ - runner: self-hosted-azure
137
+ script: L2_Longform_Speech_Transcription_with_TimeStamps_Canary_Chunked_Infer_from_Manifest
138
+ - runner: self-hosted-azure-gpus-1
139
+ script: Speech_Checkpoints_tests
140
+ timeout: 20
141
+ - runner: self-hosted-azure-gpus-1
142
+ script: L2_Speaker_dev_run_Speaker_Recognition
143
+ - runner: self-hosted-azure-gpus-1
144
+ script: L2_Speaker_dev_run_Speaker_Diarization
145
+ - runner: self-hosted-azure-gpus-1
146
+ script: L2_Speaker_dev_run_EndtoEnd_Speaker_Diarization_Sortformer
147
+ - runner: self-hosted-azure
148
+ script: L2_Speaker_dev_run_EndtoEnd_Diarizer_Inference
149
+ - runner: self-hosted-azure
150
+ script: L2_Speaker_dev_run_Speaker_Diarization_with_ASR_Inference
151
+ - runner: self-hosted-azure
152
+ script: L2_Speaker_dev_run_Clustering_Diarizer_Inference
153
+ - runner: self-hosted-azure
154
+ script: L2_Speaker_dev_run_Neural_Diarizer_Inference
155
+ - runner: self-hosted-azure
156
+ script: L2_Speaker_dev_run_Multispeaker_ASR_Data_Simulation
157
+ - runner: self-hosted-azure
158
+ script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
159
+ - runner: self-hosted-azure
160
+ script: L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
161
+ - script: L2_HF_Transformer_SpeechLM_SFT_2gpu
162
+ runner: self-hosted-azure
163
+ - script: L2_SpeechLM_LoRA_TP1PP1_MBS2
164
+ runner: self-hosted-azure
165
+ - runner: self-hosted-azure-gpus-1
166
+ script: L2_TTS_Fast_dev_runs_1_Tacotron_2
167
+ - runner: self-hosted-azure
168
+ script: L2_TTS_Fast_dev_runs_1_WaveGlow
169
+ - runner: self-hosted-azure
170
+ script: L2_TTS_Fast_dev_runs_1_FastPitch
171
+ - runner: self-hosted-azure
172
+ script: L2_TTS_Fast_dev_runs_1_Hifigan
173
+ - runner: self-hosted-azure
174
+ script: L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
175
+ - runner: self-hosted-azure
176
+ script: L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference
177
+ - runner: self-hosted-azure
178
+ script: SPEECHLM_HF_Training_DuplexS2S
179
+ - runner: self-hosted-azure
180
+ script: SPEECHLM_HF_Training_DuplexS2SSpeechDecoder
181
+ - runner: self-hosted-azure
182
+ script: SPEECHLM_HF_Training_SALM
183
+ needs: [unit-tests]
184
+ runs-on: ${{ matrix.runner }}
185
+ name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
186
+ steps:
187
+ - name: Checkout
188
+ uses: actions/checkout@v4
189
+ with:
190
+ path: ${{ github.run_id }}
191
+ - name: main
192
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
193
+ with:
194
+ runner: ${{ runner.name }}
195
+ script: ${{ matrix.script }}
196
+ tests_to_run: ${{ inputs.test_to_run }}
197
+ image: ${{ inputs.image-name }}
198
+ is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main-testcopy.yml ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: "[debug] CICD NeMo"
15
+ on:
16
+ schedule:
17
+ - cron: 0 0 * * *
18
+ - cron: "*/5 * * * *" # Runs every 5 minutes
19
+ push:
20
+ branches:
21
+ - main
22
+ workflow_dispatch:
23
+ inputs:
24
+ test_to_run:
25
+ required: false
26
+ default: all
27
+ type: string
28
+ description: Comma-separated list of tests to run. Use "all" to run the full test suite.
29
+
30
+ jobs:
31
+ pre-flight:
32
+ runs-on: ubuntu-latest
33
+ outputs:
34
+ test_to_run: ${{ steps.test_to_run.outputs.main }}
35
+ is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
36
+ no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
37
+ components_to_run: ${{ steps.components_to_run.outputs.main }}
38
+ env:
39
+ TESTS_TO_RUN: ${{ inputs.test_to_run }}
40
+ EVENT_NAME: ${{ github.event_name }}
41
+ HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
42
+ steps:
43
+ - name: Checkout branch
44
+ uses: actions/checkout@v4
45
+ with:
46
+ fetch-depth: 0
47
+
48
+ - name: Select components to run
49
+ id: components_to_run
50
+ run: |
51
+ pip install -U pip
52
+ pip install git-python
53
+
54
+ if [[ "$EVENT_NAME" == "pull_request" ]]; then
55
+ python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
56
+ else
57
+ echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
58
+ fi
59
+
60
+ components_to_run=$(cat test_modules.json)
61
+
62
+ echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"
63
+
64
+ - name: Select tests to run
65
+ id: test_to_run
66
+ run: |
67
+ # For manual dispatch, we replace `all` with the actual job names
68
+ if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
69
+ TESTS_TO_RUN=$TESTS_TO_RUN
70
+
71
+ # For correctly labeled PR, we replace `all` with the actual job names
72
+ elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
73
+ TESTS_TO_RUN=all
74
+
75
+ # For incorrectly labeled PR, run no tests
76
+ elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
77
+ TESTS_TO_RUN=""
78
+
79
+ # For push events, run all tests. This is so that we can generate coverage
80
+ # on branch `main`.
81
+ elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
82
+ TESTS_TO_RUN=all
83
+
84
+ else
85
+ echo "Unsupported event_name $EVENT_NAME provided".
86
+ exit 1
87
+ fi
88
+
89
+ parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
90
+ echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
91
+
92
+ - name: Check if this is a CI workload
93
+ shell: bash
94
+ id: is_ci_workload
95
+ run: |
96
+ branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
97
+
98
+ if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
99
+ is_ci_workload=true
100
+ echo "main=true" | tee -a "$GITHUB_OUTPUT"
101
+ else
102
+ is_ci_workload=false
103
+ fi
104
+
105
+ echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"
106
+
107
+ - name: Check if no-fail-fast is set
108
+ shell: bash
109
+ id: no_fail_fast
110
+ env:
111
+ HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
112
+ run: |
113
+ if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
114
+ no_fail_fast=true
115
+ else
116
+ no_fail_fast=false
117
+ fi
118
+
119
+ echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"
120
+
121
+ code-linting:
122
+ if: needs.pre-flight.outputs.test_to_run != '[]'
123
+ needs: [pre-flight]
124
+ uses: ./.github/workflows/code-linting.yml
125
+
126
+ cicd-wait-in-queue:
127
+ needs: [pre-flight]
128
+ runs-on: ubuntu-latest
129
+ environment: test
130
+ if: |
131
+ needs.pre-flight.outputs.test_to_run != '[]'
132
+ && needs.pre-flight.outputs.is_ci_workload == 'false'
133
+ steps:
134
+ - name: Running CI tests
135
+ run: |
136
+ echo "Running CI tests"
137
+
138
+ cicd-test-container-build:
139
+ uses: ./.github/workflows/_build_container.yml
140
+ needs: [pre-flight, code-linting, cicd-wait-in-queue]
141
+ if: |
142
+ needs.pre-flight.outputs.test_to_run != '[]'
143
+ && (
144
+ success()
145
+ || (
146
+ needs.cicd-wait-in-queue.result == 'skipped'
147
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
148
+ )
149
+ )
150
+ && !cancelled()
151
+ with:
152
+ image-name: nemo_container
153
+ dockerfile: docker/Dockerfile.ci
154
+
155
+ # cicd-import-tests:
156
+ # if: |
157
+ # needs.pre-flight.outputs.test_to_run != '[]'
158
+ # && (
159
+ # success()
160
+ # || (
161
+ # needs.cicd-wait-in-queue.result == 'skipped'
162
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
163
+ # )
164
+ # )
165
+ # && !cancelled()
166
+ # needs: [cicd-test-container-build, pre-flight]
167
+ # runs-on: self-hosted-azure-gpus-1
168
+ # steps:
169
+ # - name: Create UUID
170
+ # id: uuid
171
+ # run: |
172
+ # echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
173
+
174
+ # - name: Checkout NeMo
175
+ # uses: actions/checkout@v2
176
+ # with:
177
+ # repository: NVIDIA/NeMo
178
+ # path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
179
+
180
+ # - name: Run some checks
181
+ # run: |
182
+ # docker run \
183
+ # --rm \
184
+ # --device=/dev/nvidia0 \
185
+ # --gpus all \
186
+ # --shm-size=8g \
187
+ # --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
188
+ # --env TRANSFORMERS_OFFLINE=0 \
189
+ # --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
190
+ # # PyTorch Lightning version
191
+ # python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
192
+
193
+ # # PyTorch Lightning DDP Checks
194
+ # CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
195
+
196
+ # # Basic Import Checks
197
+ # python tests/core_ptl/check_imports.py --domain asr
198
+ # python tests/core_ptl/check_imports.py --domain nlp
199
+ # python tests/core_ptl/check_imports.py --domain tts
200
+ # '
201
+
202
+ # L0_Setup_Test_Data_And_Models:
203
+ # needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
204
+ # runs-on: self-hosted-azure
205
+ # if: |
206
+ # needs.pre-flight.outputs.test_to_run != '[]'
207
+ # && (
208
+ # success()
209
+ # || (
210
+ # needs.cicd-wait-in-queue.result == 'skipped'
211
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
212
+ # )
213
+ # )
214
+ # && !cancelled()
215
+ # steps:
216
+ # - name: Checkout
217
+ # uses: actions/checkout@v4
218
+ # with:
219
+ # path: ${{ github.run_id }}
220
+
221
+ # - name: main
222
+ # uses: NVIDIA/NeMo/.github/actions/test-template@main
223
+ # with:
224
+ # runner: ${{ runner.name }}
225
+ # script: L0_Setup_Test_Data_And_Models
226
+ # tests_to_run: '["L0_Setup_Test_Data_And_Models"]'
227
+
228
+ # cicd-main-unit-tests:
229
+ # needs: [pre-flight, cicd-test-container-build]
230
+ # uses: ./.github/workflows/cicd-main-unit-tests.yml
231
+ # if: |
232
+ # needs.pre-flight.outputs.test_to_run != '[]'
233
+ # && (
234
+ # success()
235
+ # || (
236
+ # needs.cicd-wait-in-queue.result == 'skipped'
237
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
238
+ # )
239
+ # )
240
+ # && !cancelled()
241
+ # with:
242
+ # test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
243
+
244
+ # cicd-main-export-deploy:
245
+ # needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
246
+ # uses: ./.github/workflows/cicd-main-export-deploy.yml
247
+ # if: |
248
+ # (
249
+ # needs.pre-flight.outputs.test_to_run != '[]'
250
+ # && (
251
+ # contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
252
+ # )
253
+ # )
254
+ # && (
255
+ # success()
256
+ # || (
257
+ # needs.cicd-wait-in-queue.result == 'skipped'
258
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
259
+ # )
260
+ # )
261
+ # && !cancelled()
262
+ # with:
263
+ # test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
264
+
265
+ # cicd-main-speech:
266
+ # needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
267
+ # uses: ./.github/workflows/cicd-main-speech.yml
268
+ # if: |
269
+ # (
270
+ # needs.pre-flight.outputs.test_to_run != '[]'
271
+ # && (
272
+ # contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
273
+ # )
274
+ # )
275
+ # && (
276
+ # success()
277
+ # || (
278
+ # needs.cicd-wait-in-queue.result == 'skipped'
279
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
280
+ # )
281
+ # )
282
+ # && !cancelled()
283
+ # with:
284
+ # test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
285
+
286
+ # cicd-main-automodel:
287
+ # needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
288
+ # uses: ./.github/workflows/cicd-main-automodel.yml
289
+ # if: |
290
+ # (
291
+ # needs.pre-flight.outputs.test_to_run != '[]'
292
+ # && (
293
+ # contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
294
+ # )
295
+ # )
296
+ # && (
297
+ # success()
298
+ # || (
299
+ # needs.cicd-wait-in-queue.result == 'skipped'
300
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
301
+ # )
302
+ # )
303
+ # && !cancelled()
304
+ # with:
305
+ # test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
306
+
307
+ # cicd-main-nemo2:
308
+ # needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
309
+ # uses: ./.github/workflows/cicd-main-nemo2.yml
310
+ # if: |
311
+ # (
312
+ # needs.pre-flight.outputs.test_to_run != '[]'
313
+ # && (
314
+ # contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
315
+ # || needs.pre-flight.outputs.components_to_run == '["all"]'
316
+ # )
317
+ # )
318
+ # && (
319
+ # success()
320
+ # || (
321
+ # needs.cicd-wait-in-queue.result == 'skipped'
322
+ # && needs.pre-flight.outputs.is_ci_workload == 'true'
323
+ # )
324
+ # )
325
+ # && !cancelled()
326
+ # with:
327
+ # test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
328
+
329
+ Nemo_CICD_Test_Debug:
330
+ needs:
331
+ - pre-flight
332
+ - cicd-test-container-build
333
+ # - cicd-import-tests
334
+ # - L0_Setup_Test_Data_And_Models
335
+ # - cicd-main-unit-tests
336
+ # - cicd-main-nemo2
337
+ # - cicd-main-export-deploy
338
+ # - cicd-main-automodel
339
+ # - cicd-main-speech
340
+ if: always()
341
+ runs-on: ubuntu-latest
342
+ permissions: write-all
343
+ steps:
344
+ - name: Checkout
345
+ uses: actions/checkout@v4
346
+
347
+ - name: Get workflow result
348
+ id: result
349
+ env:
350
+ GH_TOKEN: ${{ github.token }}
351
+ RUN_ID: ${{ github.run_id }}
352
+
353
+ run: |
354
+ # Get workflow run details and check job conclusions
355
+ NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
356
+ NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
357
+
358
+ if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 ]]; then
359
+ RESULT="success"
360
+ else
361
+ RESULT="failure"
362
+ fi
363
+
364
+ # Output the final status
365
+ echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
366
+
367
+ - name: Checkout for GH CLI
368
+ uses: actions/checkout@v4
369
+
370
+ - name: Remove label if not cancelled
371
+ if: ${{ steps.result.outputs.code != 'cancelled' && github.event.label.name == 'Run CICD' && github.event.pull_request.head.repo.full_name == github.repository }}
372
+ env:
373
+ GH_TOKEN: ${{ github.token }}
374
+ PR_NUMBER: ${{ github.event.number }}
375
+ run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
376
+
377
+ - name: Pipeline successful, add PR comment
378
+ if: ${{ always() && steps.result.outputs.code == 'success' && github.event_name == 'pull_request' && env.SLACK_WEBHOOK != '' }}
379
+ uses: peter-evans/create-or-update-comment@v4
380
+ env:
381
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
382
+ REPOSITORY: ${{ github.repository }}
383
+ RUN_ID: ${{ github.run_id }}
384
+ with:
385
+ issue-number: ${{ github.event.number }}
386
+ body: |
387
+ [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
388
+
389
+ We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
390
+
391
+ So it might be time to merge this PR or get some approvals.
392
+
393
+ Due to a major CI change, merges are currently handled by the automation team.
394
+ We will reach out to you quickly to merge this PR, but you can always reach us with the following handles:
395
+
396
+ //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
397
+
398
+ - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
399
+ if: ${{ always() && steps.result.outputs.code == 'failure' && env.SLACK_WEBHOOK != '' }}
400
+ env:
401
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
402
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
403
+ REPOSITORY: ${{ github.repository }}
404
+ RUN_ID: ${{ github.run_id }}
405
+ PR_NUMBER: ${{ github.event.number }}
406
+ SERVER_URL: ${{ github.server_url }}
407
+ run: |
408
+ set -x
409
+ pip install PyGithub
410
+ export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
411
+
412
+ python .github/scripts/notify.py
413
+
414
+ - name: Exit
415
+ if: ${{ always() }}
416
+ env:
417
+ RESULT: ${{ steps.result.outputs.code }}
418
+ run: |
419
+ if [ $RESULT == "success" ]; then
420
+ exit 0
421
+ else
422
+ exit 1
423
+ fi
424
+
425
+ Coverage:
426
+ runs-on: ubuntu-latest
427
+ needs: [Nemo_CICD_Test_Debug]
428
+ strategy:
429
+ matrix:
430
+ flag: [unit-test, e2e]
431
+ if: |
432
+ (
433
+ success()
434
+ || needs.Nemo_CICD_Test.result == 'success'
435
+ )
436
+ && !cancelled()
437
+ steps:
438
+ - name: Checkout
439
+ uses: actions/checkout@v4
440
+
441
+ - name: Download coverage reports of current branch
442
+ uses: actions/download-artifact@v4
443
+ with:
444
+ pattern: coverage-${{ matrix.flag }}-*
445
+
446
+ - name: Get total coverage of current branch
447
+ shell: bash -x -e -u -o pipefail {0}
448
+ if: always()
449
+ run: |
450
+ pip install coverage
451
+
452
+ ls -al .
453
+ ls -al coverage-*/
454
+ coverage combine --keep $(ls coverage-*/.coverage)
455
+ coverage report -i
456
+ rm -rf coverage-*
457
+ ls -al
458
+
459
+ - name: Upload coverage reports to Codecov
460
+ uses: codecov/codecov-action@v5
461
+ with:
462
+ token: ${{ secrets.CODECOV_TOKEN }}
463
+ verbose: true
464
+ flags: ${{ matrix.flag }}
465
+
466
+ - name: Upload artifacts
467
+ uses: actions/upload-artifact@v4
468
+ with:
469
+ name: coverage-${{ matrix.flag }}-aggregated
470
+ path: |
471
+ .coverage
472
+ include-hidden-files: true
.github/workflows/cicd-main-unit-tests.yml ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: NeMo Unit Tests
15
+ on:
16
+ workflow_call:
17
+ inputs:
18
+ test_to_run:
19
+ required: true
20
+ type: string
21
+
22
+ jobs:
23
+ collections-common-tests:
24
+ strategy:
25
+ fail-fast: false
26
+ matrix:
27
+ include:
28
+ - script: L0_Unit_Tests_GPU_Common
29
+ runner: self-hosted-azure-gpus-1
30
+ - script: L0_Unit_Tests_CPU_Common
31
+ runner: self-hosted-azure-cpu
32
+ cpu-only: true
33
+ runs-on: ${{ matrix.runner }}
34
+ name: ${{ matrix.script }}
35
+ steps:
36
+ - name: Checkout
37
+ uses: actions/checkout@v4
38
+ with:
39
+ path: ${{ github.run_id }}
40
+ - name: main
41
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
42
+ with:
43
+ runner: ${{ runner.name }}
44
+ script: ${{ matrix.script }}
45
+ is_unit_test: true
46
+ tests_to_run: ${{ inputs.test_to_run }}
47
+ cpu-only: ${{ matrix.cpu-only || false }}
48
+
49
+ collections-llm-tests:
50
+ strategy:
51
+ fail-fast: false
52
+ matrix:
53
+ include:
54
+ - script: L0_Unit_Tests_GPU_LLM
55
+ runner: self-hosted-azure-gpus-1
56
+ - script: L0_Unit_Tests_CPU_LLM
57
+ runner: self-hosted-azure-cpu
58
+ cpu-only: true
59
+ runs-on: ${{ matrix.runner }}
60
+ name: ${{ matrix.script }}
61
+ steps:
62
+ - name: Checkout
63
+ uses: actions/checkout@v4
64
+ with:
65
+ path: ${{ github.run_id }}
66
+ - name: main
67
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
68
+ with:
69
+ runner: ${{ runner.name }}
70
+ script: ${{ matrix.script }}
71
+ is_unit_test: true
72
+ tests_to_run: ${{ inputs.test_to_run }}
73
+ cpu-only: ${{ matrix.cpu-only || false }}
74
+ is_optional: ${{ matrix.is-optional || false }}
75
+
76
+ collections-multimodal-tests:
77
+ strategy:
78
+ fail-fast: false
79
+ matrix:
80
+ include:
81
+ - script: L0_Unit_Tests_GPU_Multimodal
82
+ runner: self-hosted-azure-gpus-1
83
+ - script: L0_Unit_Tests_CPU_Multimodal
84
+ runner: self-hosted-azure-cpu
85
+ cpu-only: true
86
+ runs-on: ${{ matrix.runner }}
87
+ name: ${{ matrix.script }}
88
+ steps:
89
+ - name: Checkout
90
+ uses: actions/checkout@v4
91
+ with:
92
+ path: ${{ github.run_id }}
93
+ - name: main
94
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
95
+ with:
96
+ runner: ${{ runner.name }}
97
+ script: ${{ matrix.script }}
98
+ is_unit_test: true
99
+ tests_to_run: ${{ inputs.test_to_run }}
100
+ cpu-only: ${{ matrix.cpu-only || false }}
101
+ is_optional: ${{ matrix.is-optional || false }}
102
+ collections-vlm-tests:
103
+ strategy:
104
+ fail-fast: false
105
+ matrix:
106
+ include:
107
+ - script: L0_Unit_Tests_GPU_VLM
108
+ runner: self-hosted-azure-gpus-1
109
+ - script: L0_Unit_Tests_CPU_VLM
110
+ runner: self-hosted-azure-cpu
111
+ cpu-only: true
112
+ runs-on: ${{ matrix.runner }}
113
+ name: ${{ matrix.script }}
114
+ steps:
115
+ - name: Checkout
116
+ uses: actions/checkout@v4
117
+ with:
118
+ path: ${{ github.run_id }}
119
+ - name: main
120
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
121
+ with:
122
+ runner: ${{ runner.name }}
123
+ script: ${{ matrix.script }}
124
+ is_unit_test: true
125
+ tests_to_run: ${{ inputs.test_to_run }}
126
+ cpu-only: ${{ matrix.cpu-only || false }}
127
+ is_optional: ${{ matrix.is-optional || false }}
128
+
129
+ core-tests:
130
+ strategy:
131
+ fail-fast: false
132
+ matrix:
133
+ include:
134
+ - script: L0_Unit_Tests_GPU_Core
135
+ runner: self-hosted-azure-gpus-1
136
+ - script: L0_Unit_Tests_CPU_Core
137
+ runner: self-hosted-azure-cpu
138
+ cpu-only: true
139
+ - script: L0_Unit_Tests_GPU_Hydra
140
+ runner: self-hosted-azure-gpus-1
141
+ - script: L0_Unit_Tests_CPU_Hydra
142
+ runner: self-hosted-azure-cpu
143
+ cpu-only: true
144
+ runs-on: ${{ matrix.runner }}
145
+ name: ${{ matrix.script }}
146
+ steps:
147
+ - name: Checkout
148
+ uses: actions/checkout@v4
149
+ with:
150
+ path: ${{ github.run_id }}
151
+ - name: main
152
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
153
+ with:
154
+ runner: ${{ runner.name }}
155
+ script: ${{ matrix.script }}
156
+ is_unit_test: true
157
+ tests_to_run: ${{ inputs.test_to_run }}
158
+ cpu-only: ${{ matrix.cpu-only || false }}
159
+
160
+ lightning-tests:
161
+ strategy:
162
+ fail-fast: false
163
+ matrix:
164
+ include:
165
+ - script: L0_Unit_Tests_GPU_Lightning
166
+ runner: self-hosted-azure
167
+ - script: L0_Unit_Tests_CPU_Lightning
168
+ runner: self-hosted-azure-cpu
169
+ cpu-only: true
170
+ runs-on: ${{ matrix.runner }}
171
+ name: ${{ matrix.script }}
172
+ steps:
173
+ - name: Checkout
174
+ uses: actions/checkout@v4
175
+ with:
176
+ path: ${{ github.run_id }}
177
+ - name: main
178
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
179
+ with:
180
+ runner: ${{ runner.name }}
181
+ script: ${{ matrix.script }}
182
+ is_unit_test: true
183
+ tests_to_run: ${{ inputs.test_to_run }}
184
+ cpu-only: ${{ matrix.cpu-only || false }}
185
+ is_optional: ${{ matrix.is-optional || false }}
186
+
187
+ other-tests:
188
+ strategy:
189
+ fail-fast: false
190
+ matrix:
191
+ include:
192
+ - script: L0_Unit_Tests_GPU_Others
193
+ runner: self-hosted-azure-gpus-1
194
+ - script: L0_Unit_Tests_CPU_Others
195
+ runner: self-hosted-azure-cpu
196
+ cpu-only: true
197
+ runs-on: ${{ matrix.runner }}
198
+ name: ${{ matrix.script }}
199
+ steps:
200
+ - name: Checkout
201
+ uses: actions/checkout@v4
202
+ with:
203
+ path: ${{ github.run_id }}
204
+ - name: main
205
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
206
+ with:
207
+ runner: ${{ runner.name }}
208
+ script: ${{ matrix.script }}
209
+ is_unit_test: true
210
+ tests_to_run: ${{ inputs.test_to_run }}
211
+ cpu-only: ${{ matrix.cpu-only || false }}
212
+ is_optional: ${{ matrix.is-optional || false }}
.github/workflows/cicd-main.yml ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2025, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: CICD NeMo
15
+ on:
16
+ schedule:
17
+ - cron: 0 0 * * *
18
+ pull_request:
19
+ branches:
20
+ - main
21
+ - r**
22
+ - weekly-bump*
23
+ types: [labeled]
24
+ workflow_dispatch:
25
+ inputs:
26
+ test_to_run:
27
+ required: false
28
+ default: all
29
+ type: string
30
+ description: Comma-separated list of tests to run. Use "all" to run the full test suite.
31
+
32
+ concurrency:
33
+ # group: ${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_id || github.event.pull_request.number || github.ref }}-${{ github.event_name }}
34
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
35
+ cancel-in-progress: true
36
+
37
+ jobs:
38
+ pre-flight:
39
+ runs-on: ubuntu-latest
40
+ outputs:
41
+ test_to_run: ${{ steps.test_to_run.outputs.main }}
42
+ is_ci_workload: ${{ steps.is_ci_workload.outputs.main }}
43
+ no_fail_fast: ${{ steps.no_fail_fast.outputs.main }}
44
+ components_to_run: ${{ steps.components_to_run.outputs.main }}
45
+ env:
46
+ TESTS_TO_RUN: ${{ inputs.test_to_run }}
47
+ EVENT_NAME: ${{ github.event_name }}
48
+ HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
49
+ steps:
50
+ - name: Checkout branch
51
+ uses: actions/checkout@v4
52
+ with:
53
+ fetch-depth: 0
54
+
55
+ - name: Select components to run
56
+ id: components_to_run
57
+ run: |
58
+ pip install -U pip
59
+ pip install git-python
60
+
61
+ if [[ "$EVENT_NAME" == "pull_request" ]]; then
62
+ python .github/scripts/components_to_run.py --source-sha ${{ github.event.pull_request.head.sha }} --target-sha ${{ github.event.pull_request.base.sha }}
63
+ else
64
+ echo '["nemo2", "automodel", "export-deploy", "speech"]' | tee -a test_modules.json
65
+ fi
66
+
67
+ components_to_run=$(cat test_modules.json)
68
+
69
+ echo "main=${components_to_run}" | tee -a "$GITHUB_OUTPUT"
70
+
71
+ - name: Select tests to run
72
+ id: test_to_run
73
+ run: |
74
+ # For manual dispatch, we replace `all` with the actual job names
75
+ if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
76
+ TESTS_TO_RUN=$TESTS_TO_RUN
77
+
78
+ # For correctly labeled PR, we replace `all` with the actual job names
79
+ elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
80
+ TESTS_TO_RUN=all
81
+
82
+ # For incorrectly labeled PR, run no tests
83
+ elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
84
+ TESTS_TO_RUN=""
85
+
86
+ # For push events, run all tests. This is so that we can generate coverage
87
+ # on branch `main`.
88
+ elif [[ "$EVENT_NAME" == "push" || "$EVENT_NAME" == "schedule" ]]; then
89
+ TESTS_TO_RUN=all
90
+
91
+ else
92
+ echo "Unsupported event_name $EVENT_NAME provided".
93
+ exit 1
94
+ fi
95
+
96
+ parsed_string=$(echo "$TESTS_TO_RUN" | jq -c --raw-input 'split(",")')
97
+ echo "main=${parsed_string}" | tee -a "$GITHUB_OUTPUT"
98
+
99
+ - name: Check if this is a CI workload
100
+ shell: bash
101
+ id: is_ci_workload
102
+ run: |
103
+ branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
104
+
105
+ if [[ "$branch_name" =~ ^bump-ci-container || "$EVENT_NAME" == "schedule" ]]; then
106
+ is_ci_workload=true
107
+ echo "main=true" | tee -a "$GITHUB_OUTPUT"
108
+ else
109
+ is_ci_workload=false
110
+ fi
111
+
112
+ echo "main=$is_ci_workload" | tee -a "$GITHUB_OUTPUT"
113
+
114
+ - name: Check if no-fail-fast is set
115
+ shell: bash
116
+ id: no_fail_fast
117
+ env:
118
+ HAS_FAIL_FAST_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'no-fail-fast') }}
119
+ run: |
120
+ if [[ "$HAS_FAIL_FAST_LABEL" == "true" || "$EVENT_NAME" == "schedule" ]]; then
121
+ no_fail_fast=true
122
+ else
123
+ no_fail_fast=false
124
+ fi
125
+
126
+ echo "main=$no_fail_fast" | tee -a "$GITHUB_OUTPUT"
127
+
128
+ code-linting:
129
+ if: needs.pre-flight.outputs.test_to_run != '[]'
130
+ needs: [pre-flight]
131
+ uses: ./.github/workflows/code-linting.yml
132
+
133
+ cicd-wait-in-queue:
134
+ needs: [pre-flight, code-linting]
135
+ runs-on: ubuntu-latest
136
+ environment: test
137
+ if: |
138
+ needs.pre-flight.outputs.test_to_run != '[]'
139
+ && needs.pre-flight.outputs.components_to_run != '[]'
140
+ && needs.pre-flight.outputs.is_ci_workload == 'false'
141
+ steps:
142
+ - name: Running CI tests
143
+ run: |
144
+ echo "Running CI tests"
145
+
146
+ cicd-test-container-build:
147
+ uses: ./.github/workflows/_build_container.yml
148
+ needs: [pre-flight, code-linting, cicd-wait-in-queue]
149
+ if: |
150
+ needs.pre-flight.outputs.test_to_run != '[]'
151
+ && needs.pre-flight.outputs.components_to_run != '[]'
152
+ && (
153
+ success()
154
+ || (
155
+ needs.cicd-wait-in-queue.result == 'skipped'
156
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
157
+ )
158
+ )
159
+ && !cancelled()
160
+ with:
161
+ image-name: nemo_container
162
+ dockerfile: docker/Dockerfile.ci
163
+
164
+ cicd-import-tests:
165
+ if: |
166
+ needs.pre-flight.outputs.test_to_run != '[]'
167
+ && needs.pre-flight.outputs.components_to_run != '[]'
168
+ && (
169
+ success()
170
+ || (
171
+ needs.cicd-wait-in-queue.result == 'skipped'
172
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
173
+ )
174
+ )
175
+ && !cancelled()
176
+ needs: [cicd-test-container-build, pre-flight]
177
+ runs-on: self-hosted-azure-gpus-1
178
+ steps:
179
+ - name: Create UUID
180
+ id: uuid
181
+ run: |
182
+ echo "id=$(uuidgen)" >> "$GITHUB_OUTPUT"
183
+
184
+ - name: Checkout NeMo
185
+ uses: actions/checkout@v2
186
+ with:
187
+ repository: NVIDIA/NeMo
188
+ path: ${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo
189
+
190
+ - name: Run some checks
191
+ run: |
192
+ docker run \
193
+ --rm \
194
+ --device=/dev/nvidia0 \
195
+ --gpus all \
196
+ --shm-size=8g \
197
+ --volume $(pwd)/${{ github.run_id }}/${{steps.uuid.outputs.id }}/NeMo:/workspace \
198
+ --env TRANSFORMERS_OFFLINE=0 \
199
+ --env HYDRA_FULL_ERROR=1 --env PYTHONUNBUFFERED=1 nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c '\
200
+ # PyTorch Lightning version
201
+ python -c "import lightning.pytorch; print(lightning.pytorch.__version__)"
202
+
203
+ # PyTorch Lightning DDP Checks
204
+ CUDA_VISIBLE_DEVICES="0,1" python "tests/core_ptl/check_for_ranks.py"
205
+
206
+ # Basic Import Checks
207
+ python tests/core_ptl/check_imports.py --domain asr
208
+ python tests/core_ptl/check_imports.py --domain nlp
209
+ python tests/core_ptl/check_imports.py --domain tts
210
+ '
211
+
212
+ L0_Setup_Test_Data_And_Models:
213
+ needs: [pre-flight, cicd-test-container-build, cicd-wait-in-queue]
214
+ runs-on: self-hosted-azure
215
+ if: |
216
+ needs.pre-flight.outputs.test_to_run != '[]'
217
+ && needs.pre-flight.outputs.components_to_run != '[]'
218
+ && (
219
+ success()
220
+ || (
221
+ needs.cicd-wait-in-queue.result == 'skipped'
222
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
223
+ )
224
+ )
225
+ && !cancelled()
226
+ steps:
227
+ - name: Checkout
228
+ uses: actions/checkout@v4
229
+ with:
230
+ path: ${{ github.run_id }}
231
+
232
+ - name: main
233
+ uses: NVIDIA/NeMo/.github/actions/test-template@main
234
+ with:
235
+ runner: ${{ runner.name }}
236
+ script: L0_Setup_Test_Data_And_Models
237
+ tests_to_run: '["L0_Setup_Test_Data_And_Models"]'
238
+
239
+ cicd-main-unit-tests:
240
+ needs: [pre-flight, cicd-test-container-build]
241
+ uses: ./.github/workflows/cicd-main-unit-tests.yml
242
+ if: |
243
+ needs.pre-flight.outputs.test_to_run != '[]'
244
+ && needs.pre-flight.outputs.components_to_run != '[]'
245
+ && (
246
+ success()
247
+ || (
248
+ needs.cicd-wait-in-queue.result == 'skipped'
249
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
250
+ )
251
+ )
252
+ && !cancelled()
253
+ with:
254
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
255
+
256
+ cicd-main-export-deploy:
257
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
258
+ uses: ./.github/workflows/cicd-main-export-deploy.yml
259
+ if: |
260
+ (
261
+ needs.pre-flight.outputs.test_to_run != '[]'
262
+ && (
263
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'export-deploy')
264
+ )
265
+ )
266
+ && (
267
+ success()
268
+ || (
269
+ needs.cicd-wait-in-queue.result == 'skipped'
270
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
271
+ )
272
+ )
273
+ && !cancelled()
274
+ with:
275
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
276
+
277
+ cicd-main-speech:
278
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
279
+ uses: ./.github/workflows/cicd-main-speech.yml
280
+ if: |
281
+ (
282
+ needs.pre-flight.outputs.test_to_run != '[]'
283
+ && (
284
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'speech')
285
+ )
286
+ )
287
+ && (
288
+ success()
289
+ || (
290
+ needs.cicd-wait-in-queue.result == 'skipped'
291
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
292
+ )
293
+ )
294
+ && !cancelled()
295
+ with:
296
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
297
+
298
+ cicd-main-automodel:
299
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
300
+ uses: ./.github/workflows/cicd-main-automodel.yml
301
+ if: |
302
+ (
303
+ needs.pre-flight.outputs.test_to_run != '[]'
304
+ && (
305
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'automodel')
306
+ )
307
+ )
308
+ && (
309
+ success()
310
+ || (
311
+ needs.cicd-wait-in-queue.result == 'skipped'
312
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
313
+ )
314
+ )
315
+ && !cancelled()
316
+ with:
317
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
318
+
319
+ cicd-main-nemo2:
320
+ needs: [pre-flight, cicd-test-container-build, cicd-main-unit-tests]
321
+ uses: ./.github/workflows/cicd-main-nemo2.yml
322
+ if: |
323
+ (
324
+ needs.pre-flight.outputs.test_to_run != '[]'
325
+ && (
326
+ contains(fromJson(needs.pre-flight.outputs.components_to_run), 'nemo2')
327
+ || needs.pre-flight.outputs.components_to_run == '["all"]'
328
+ )
329
+ )
330
+ && (
331
+ success()
332
+ || (
333
+ needs.cicd-wait-in-queue.result == 'skipped'
334
+ && needs.pre-flight.outputs.is_ci_workload == 'true'
335
+ )
336
+ )
337
+ && !cancelled()
338
+ with:
339
+ test_to_run: ${{ needs.pre-flight.outputs.test_to_run }}
340
+
341
+ Nemo_CICD_Test:
342
+ needs:
343
+ - pre-flight
344
+ - cicd-test-container-build
345
+ - cicd-import-tests
346
+ - L0_Setup_Test_Data_And_Models
347
+ - cicd-main-unit-tests
348
+ - cicd-main-nemo2
349
+ - cicd-main-export-deploy
350
+ - cicd-main-automodel
351
+ - cicd-main-speech
352
+ if: always()
353
+ runs-on: ubuntu-latest
354
+ permissions: write-all
355
+ steps:
356
+ - name: Checkout
357
+ uses: actions/checkout@v4
358
+
359
+ - name: Get workflow result
360
+ id: result
361
+ env:
362
+ GH_TOKEN: ${{ github.token }}
363
+ RUN_ID: ${{ github.run_id }}
364
+ HAS_LABEL: ${{ github.event.label.name == 'Run CICD' }}
365
+ IS_SCHEDULED: ${{ github.event_name == 'schedule' }}
366
+ run: |
367
+ # Get workflow run details and check job conclusions
368
+ LATEST_ATTEMPT=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion != null) | .conclusion] | last')
369
+ NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
370
+ NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
371
+
372
+ if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && ("$HAS_LABEL" == "true" || "$IS_SCHEDULED" == "true") ]]; then
373
+ RESULT="success"
374
+ elif [[ $NUM_CANCELLED -gt 0 ]]; then
375
+ RESULT="cancelled"
376
+ else
377
+ RESULT="failure"
378
+ fi
379
+
380
+ # Output the final status
381
+ echo "code=$RESULT" | tee -a $GITHUB_OUTPUT
382
+
383
+ - name: Checkout for GH CLI
384
+ uses: actions/checkout@v4
385
+
386
+ - name: Remove label if not cancelled
387
+ if: |
388
+ steps.result.outputs.code != 'cancelled'
389
+ && github.event.label.name == 'Run CICD'
390
+ && github.event.pull_request.head.repo.full_name == github.repository
391
+ env:
392
+ GH_TOKEN: ${{ github.token }}
393
+ PR_NUMBER: ${{ github.event.number }}
394
+ run: gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
395
+
396
+ - name: Pipeline successful, add PR comment
397
+ if: |
398
+ steps.result.outputs.code == 'success'
399
+ && github.event_name == 'pull_request'
400
+ && env.SLACK_WEBHOOK != ''
401
+ uses: peter-evans/create-or-update-comment@v4
402
+ env:
403
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
404
+ REPOSITORY: ${{ github.repository }}
405
+ RUN_ID: ${{ github.run_id }}
406
+ with:
407
+ issue-number: ${{ github.event.number }}
408
+ body: |
409
+ [🤖]: Hi @${{ github.event.pull_request.user.login }} 👋,
410
+
411
+ We wanted to let you know that a [CICD pipeline](https://github.com/${{ env.REPOSITORY }}/actions/runs/${{ env.RUN_ID }}) for this PR just finished successfully.
412
+
413
+ So it might be time to merge this PR or get some approvals.
414
+
415
+ //cc @chtruong814 @ko3n1g @pablo-garay @thomasdhc
416
+
417
+ - name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
418
+ if: |
419
+ steps.result.outputs.code == 'failure'
420
+ && github.event.label.name == 'Run CICD'
421
+ && env.SLACK_WEBHOOK != ''
422
+ env:
423
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
424
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
425
+ REPOSITORY: ${{ github.repository }}
426
+ RUN_ID: ${{ github.run_id }}
427
+ PR_NUMBER: ${{ github.event.number }}
428
+ SERVER_URL: ${{ github.server_url }}
429
+ run: |
430
+ set -x
431
+ pip install PyGithub
432
+ export BRANCH_NAME=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}
433
+
434
+ python .github/scripts/notify.py
435
+
436
+ - name: Exit
437
+ if: ${{ always() }}
438
+ env:
439
+ RESULT: ${{ steps.result.outputs.code }}
440
+ run: |
441
+ if [ $RESULT == "success" ]; then
442
+ exit 0
443
+ else
444
+ exit 1
445
+ fi
446
+
447
+ Coverage:
448
+ runs-on: ubuntu-latest
449
+ needs: [pre-flight, Nemo_CICD_Test]
450
+ if: |
451
+ needs.pre-flight.outputs.test_to_run != '[]'
452
+ && needs.pre-flight.outputs.components_to_run != '[]'
453
+ && (
454
+ success()
455
+ || needs.Nemo_CICD_Test.result == 'success'
456
+ )
457
+ && !cancelled()
458
+ strategy:
459
+ matrix:
460
+ flag: [unit-test, e2e]
461
+ steps:
462
+ - name: Checkout
463
+ uses: actions/checkout@v4
464
+
465
+ - name: Download coverage reports of current branch
466
+ uses: actions/download-artifact@v4
467
+ with:
468
+ pattern: coverage-${{ matrix.flag }}-*
469
+
470
+ - name: Get total coverage of current branch
471
+ shell: bash -x -e -u -o pipefail {0}
472
+ if: always()
473
+ run: |
474
+ pip install coverage
475
+
476
+ ls -al .
477
+ ls -al coverage-*/
478
+ coverage combine --keep $(ls coverage-*/.coverage)
479
+ coverage report -i
480
+ rm -rf coverage-*
481
+ ls -al
482
+
483
+ - name: Upload coverage reports to Codecov
484
+ uses: codecov/codecov-action@v5
485
+ with:
486
+ token: ${{ secrets.CODECOV_TOKEN }}
487
+ verbose: true
488
+ flags: ${{ matrix.flag }}
489
+
490
+ - name: Upload artifacts
491
+ uses: actions/upload-artifact@v4
492
+ with:
493
+ name: coverage-${{ matrix.flag }}-aggregated
494
+ path: |
495
+ .coverage
496
+ include-hidden-files: true
.github/workflows/cicd-relabel-bot.yml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # If the PR get's updated by a new commit, it prevents auto-merges
2
+ # since there's no CI event attached to the commit anymore.
3
+ # This workflow re-attaches the label after a push, if the PR
4
+ # was already labeled prior to the push.
5
+ name: CICD Relabel bot
6
+
7
+ on:
8
+ pull_request_target:
9
+
10
+ jobs:
11
+ relabel:
12
+ runs-on: ubuntu-latest
13
+ env:
14
+ PR_NUMBER: ${{ github.event.number }}
15
+ GH_TOKEN: ${{ secrets.NEMO_RELABEL_TOKEN }}
16
+ HOSTNAME: ${{ github.server_url }}
17
+ permissions: write-all
18
+ steps:
19
+ - name: Checkout repo
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Check if PR was already labeled with `Run CICD`
23
+ id: pre-flight
24
+ run: |
25
+ LABELS=$(gh pr view "$PR_NUMBER" --json labels)
26
+ HAS_LABEL=$(echo $LABELS \
27
+ | jq '[.labels[].name] | any(. == "Run CICD")'
28
+ )
29
+
30
+ echo "has-label=$HAS_LABEL" | tee -a "$GITHUB_OUTPUT"
31
+
32
+ - name: Relabel PR
33
+ if: ${{ steps.pre-flight.outputs.has-label == 'true' }}
34
+ run: |
35
+ gh pr edit "$PR_NUMBER" --remove-label "Run CICD"
36
+ gh pr edit "$PR_NUMBER" --add-label "Run CICD"
.github/workflows/close-inactive-issue-pr.yml ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Stale-Close-Inactive-Issues-PRs
2
+ on:
3
+ schedule:
4
+ - cron: "30 1 * * *"
5
+
6
+ jobs:
7
+ close-issues:
8
+ runs-on: ubuntu-latest
9
+ permissions:
10
+ issues: write
11
+ pull-requests: write
12
+ steps:
13
+ - uses: actions/stale@v6
14
+ with:
15
+ operations-per-run: 100
16
+ days-before-issue-stale: 30
17
+ days-before-issue-close: 7
18
+ stale-issue-label: "stale"
19
+ stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. Remove stale label or comment or this will be closed in 7 days."
20
+ close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
21
+ days-before-pr-stale: 14
22
+ days-before-pr-close: 7
23
+ stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. Remove stale label or comment or update or this will be closed in 7 days."
24
+ close-pr-message: "This PR was closed because it has been inactive for 7 days since being marked as stale."
25
+ repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/code-formatting.yml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Isort and Black Formatting
2
+ # Incrementally reformat only changed files with black, all files with isort
3
+ #
4
+ # Replaces pre-commit.ci, since it reformats all the files.
5
+ # See issue https://github.com/pre-commit-ci/issues/issues/90
6
+ #
7
+ # The action requires a custom token to trigger workflow after pushing reformatted files back to the branch.
8
+ # `secrets.GITHUB_TOKEN` can be used instead, but this will result
9
+ # in not running necessary checks after reformatting, which is undesirable.
10
+ # For details see https://github.com/orgs/community/discussions/25702
11
+
12
+ on:
13
+ pull_request_target:
14
+ paths:
15
+ - "**.py"
16
+ types: [opened, synchronize, reopened, labeled, unlabeled]
17
+
18
+ defaults:
19
+ run:
20
+ shell: bash -x -e -u -o pipefail {0}
21
+
22
+ jobs:
23
+ reformat_with_isort_and_black:
24
+ runs-on: ubuntu-latest
25
+ permissions:
26
+ # write permissions required to commit changes
27
+ contents: write
28
+ steps:
29
+ - name: Checkout branch
30
+ uses: actions/checkout@v4
31
+ with:
32
+ # setup repository and ref for PRs, see
33
+ # https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
34
+ repository: ${{ github.event.pull_request.head.repo.full_name }}
35
+ ref: ${{ github.event.pull_request.head.ref }}
36
+ # custom token is required to trigger actions after reformatting + pushing
37
+ token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
38
+ fetch-depth: 0
39
+
40
+ - name: Get changed files
41
+ id: changed-files
42
+ uses: step-security/changed-files@v45.0.1
43
+ with:
44
+ files: |
45
+ **.py
46
+
47
+ - name: Setup Python env
48
+ uses: actions/setup-python@v5
49
+ with:
50
+ python-version: "3.10"
51
+
52
+ - name: black
53
+ uses: psf/black@stable
54
+ if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
55
+ with:
56
+ options: "--verbose"
57
+ # apply only to changed files (pass explicitly the files)
58
+ src: "${{ steps.changed-files.outputs.all_changed_files }}"
59
+ version: "~= 24.3"
60
+
61
+ - name: isort
62
+ uses: isort/isort-action@v1
63
+ if: ${{ steps.changed-files.outputs.any_changed == 'true' }}
64
+ with:
65
+ isort-version: "5.13.2"
66
+ # reformat all files with isort – safe since the whole repo is already reformatted
67
+ configuration: ""
68
+
69
+ - uses: EndBug/add-and-commit@v9
70
+ # Commit changes. Nothing is committed if no changes.
71
+ with:
72
+ message: Apply isort and black reformatting
73
+ commit: --signoff
.github/workflows/code-init-file-checker.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Check __init__ files
2
+
3
+ on:
4
+ pull_request:
5
+ types: [opened, synchronize, reopened]
6
+
7
+ jobs:
8
+ check-init-files:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - name: Checkout
12
+ uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v4
16
+ with:
17
+ python-version: "3.11"
18
+
19
+ - name: Install init-file-checker
20
+ run: pip install init-file-checker
21
+
22
+ - name: Run init-file-checker
23
+ run: init-file-checker nemo/
.github/workflows/code-linting.yml ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: PyLint and flake8 linting
2
+
3
+ on:
4
+ pull_request:
5
+ types: [opened, synchronize, reopened, labeled, unlabeled]
6
+ workflow_call:
7
+
8
+ jobs:
9
+ linting:
10
+ name: "Domain: ${{ matrix.domain }}"
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ fail-fast: false
14
+ matrix:
15
+ domain: [speech, other]
16
+ env:
17
+ DOMAIN: ${{ matrix.domain }}
18
+ steps:
19
+ - name: Checkout
20
+ uses: actions/checkout@v4
21
+
22
+ - name: Select filter
23
+ id: filter
24
+ run: |
25
+ if [[ "$DOMAIN" == "speech" ]]; then
26
+ FILTER=$(jq -crn '[
27
+ "nemo/collections/common/data/lhotse/*.py",
28
+ "nemo/collections/asr/**/*.py",
29
+ "nemo/collections/tts/**/*.py",
30
+ "nemo/collections/audio/**/*.py",
31
+ "nemo/collections/multimodal/speech_llm/**/*.py",
32
+ "nemo/collections/speechlm/**/*.py",
33
+ "nemo/collections/speechlm2/**/*.py"
34
+ ] | join(",")')
35
+
36
+ else
37
+ FILTER=$(jq -crn '[
38
+ "nemo/**/*.py",
39
+ "!nemo/collections/common/data/lhotse/*.py",
40
+ "!nemo/collections/asr/**/*.py",
41
+ "!nemo/collections/tts/**/*.py",
42
+ "!nemo/collections/audio/**/*.py",
43
+ "!nemo/collections/multimodal/speech_llm/**/*.py",
44
+ "!nemo/collections/speechlm/**/*.py",
45
+ "!nemo/collections/speechlm2/**/*.py"
46
+ ] | join(",")')
47
+ fi
48
+
49
+ echo "main=$FILTER" | tee -a "$GITHUB_OUTPUT"
50
+
51
+ - name: Get changed files
52
+ id: changed-files
53
+ uses: step-security/changed-files@v45.0.1
54
+ with:
55
+ files: ${{ steps.filter.outputs.main }}
56
+ files_separator: ","
57
+ separator: " "
58
+
59
+ - name: Run PyLint
60
+ id: pylint
61
+ env:
62
+ CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
63
+ SKIP_DOCS: ${{ contains(github.event.pull_request.labels.*.name, 'skip-docs') }}
64
+ SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
65
+ run: |
66
+ if [[ -z "$CHANGED_FILES" ]]; then
67
+ echo Nothing to lint.
68
+ echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
69
+ exit 0
70
+ fi
71
+
72
+ if [[ $SKIP_DOCS == true ]]; then
73
+ ADDITIONAL_PYLINT_ARGS="--disable=C0115,C0116"
74
+ else
75
+ ADDITIONAL_PYLINT_ARGS=""
76
+ fi
77
+
78
+ if [[ $SKIP_LINTING == true ]]; then
79
+ ADDITIONAL_PYLINT_ARGS="--exit-zero"
80
+ fi
81
+
82
+ pip install pylint
83
+ set +e
84
+ pylint $ADDITIONAL_PYLINT_ARGS --output "pylintrc.$DOMAIN.txt" --rcfile ".pylintrc.$DOMAIN" ${CHANGED_FILES[@]}
85
+ echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
86
+
87
+ - name: Run flake8
88
+ id: flake8
89
+ env:
90
+ CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
91
+ SKIP_LINTING: ${{ contains(github.event.pull_request.labels.*.name, 'skip-linting') }}
92
+ run: |
93
+ if [[ -z "$CHANGED_FILES" ]]; then
94
+ echo Nothing to lint.
95
+ echo "exit-code=0" | tee -a "$GITHUB_OUTPUT"
96
+ exit 0
97
+ fi
98
+
99
+ if [[ $SKIP_LINTING == true ]]; then
100
+ ADDITIONAL_FLAKE8_ARGS="--exit-zero"
101
+ else
102
+ ADDITIONAL_FLAKE8_ARGS=""
103
+ fi
104
+
105
+ pip install flake8
106
+ set +e
107
+ flake8 $ADDITIONAL_FLAKE8_ARGS --output "flake8.$DOMAIN.txt" --config ".flake8.$DOMAIN" ${CHANGED_FILES[@]}
108
+ echo "exit-code=$?" | tee -a "$GITHUB_OUTPUT"
109
+
110
+ - name: Summary
111
+ env:
112
+ PYLINT: ${{ steps.pylint.outputs.exit-code == 0 }}
113
+ FLAKE8: ${{ steps.flake8.outputs.exit-code == 0 }}
114
+ run: |
115
+
116
+ if [[ "$PYLINT" != "true" ]]; then
117
+ echo "Pylint output:" | tee -a $GITHUB_STEP_SUMMARY
118
+
119
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
120
+ cat pylintrc.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
121
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
122
+ fi
123
+
124
+ if [[ "$FLAKE8" != "true" ]]; then
125
+ echo "Flake8 output:" | tee -a $GITHUB_STEP_SUMMARY
126
+
127
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
128
+ cat flake8.$DOMAIN.txt | tee -a $GITHUB_STEP_SUMMARY
129
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
130
+ fi
131
+
132
+ if [[ "$PYLINT" != "true" || "$FLAKE8" != "true" ]]; then
133
+ echo "The following directories got scanned:" | tee -a $GITHUB_STEP_SUMMARY
134
+
135
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
136
+ echo ${{ steps.filter.outputs.main }} | tee -a $GITHUB_STEP_SUMMARY
137
+ echo '```' | tee -a $GITHUB_STEP_SUMMARY
138
+
139
+ exit 1
140
+ fi
141
+
142
+ Nemo_Linting_Test:
143
+ needs: linting
144
+ runs-on: ubuntu-latest
145
+ if: always()
146
+ steps:
147
+ - name: Main
148
+ env:
149
+ RESULTS: ${{ toJson(needs.linting) }}
150
+ run: |
151
+ RESULT=$(echo "$RESULTS" | jq -r '.result')
152
+
153
+ if [[ "$RESULT" == "success" ]]; then
154
+ echo "All passed."
155
+ exit 0
156
+ else
157
+ echo "Some linting domains failed."
158
+ exit 1
159
+ fi
.github/workflows/codeql.yml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # For most projects, this workflow file will not need changing; you simply need
2
+ # to commit it to your repository.
3
+ #
4
+ # You may wish to alter this file to override the set of languages analyzed,
5
+ # or to provide custom queries or build logic.
6
+ #
7
+ # ******** NOTE ********
8
+ # We have attempted to detect the languages in your repository. Please check
9
+ # the `language` matrix defined below to confirm you have the correct set of
10
+ # supported CodeQL languages.
11
+ #
12
+ name: "CodeQL"
13
+
14
+ on:
15
+ push:
16
+ branches: [ "main", "[rv][0-9]*", "gh-pages-src" ]
17
+ pull_request:
18
+ # The branches below must be a subset of the branches above
19
+ branches: [ "main" ]
20
+ schedule:
21
+ - cron: '19 1 * * 4'
22
+
23
+ jobs:
24
+ analyze:
25
+ name: Analyze
26
+ runs-on: ubuntu-latest
27
+ permissions:
28
+ actions: read
29
+ contents: read
30
+ security-events: write
31
+
32
+ strategy:
33
+ fail-fast: false
34
+ matrix:
35
+ language: [ 'python' ]
36
+ # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37
+ # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38
+
39
+ steps:
40
+ - name: Checkout repository
41
+ uses: actions/checkout@v3
42
+
43
+ # Initializes the CodeQL tools for scanning.
44
+ - name: Initialize CodeQL
45
+ uses: github/codeql-action/init@v2
46
+ with:
47
+ languages: ${{ matrix.language }}
48
+ # If you wish to specify custom queries, you can do so here or in a config file.
49
+ # By default, queries listed here will override any specified in a config file.
50
+ # Prefix the list here with "+" to use these queries and those in the config file.
51
+
52
+ # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53
+ queries: security-and-quality # security-extended,
54
+ config-file: ./.github/workflows/config/codeql.yml
55
+
56
+
57
+ # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
58
+ # If this step fails, then you should remove it and run the build manually (see below)
59
+ - name: Autobuild
60
+ uses: github/codeql-action/autobuild@v2
61
+
62
+ # ℹ️ Command-line programs to run using the OS shell.
63
+ # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
64
+
65
+ # If the Autobuild fails above, remove it and uncomment the following three lines.
66
+ # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
67
+
68
+ # - run: |
69
+ # echo "Run, Build Application using script"
70
+ # ./location_of_script_within_repo/buildscript.sh
71
+
72
+ - name: Perform CodeQL Analysis
73
+ uses: github/codeql-action/analyze@v2
74
+ with:
75
+ category: "/language:${{matrix.language}}"
.github/workflows/config/changelog-config.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "categories": [
3
+ {
4
+ "title": "## ASR\n\n<details><summary>Changelog</summary>",
5
+ "labels": ["asr"],
6
+ "exclude_labels": ["cherry-pick"]
7
+ },
8
+ {
9
+ "title": "</details>\n\n## TTS\n\n<details><summary>Changelog</summary>",
10
+ "labels": ["tts"],
11
+ "exclude_labels": ["cherry-pick"]
12
+ },
13
+ {
14
+ "title": "</details>\n\n## NLP / NMT\n\n<details><summary>Changelog</summary>",
15
+ "labels": ["nlp", "nmt", "megatron"],
16
+ "exclude_labels": ["cherry-pick"]
17
+ },
18
+ {
19
+ "title": "</details>\n\n## Text Normalization / Inverse Text Normalization\n\n<details><summary>Changelog</summary>",
20
+ "labels": ["tn", "itn"],
21
+ "exclude_labels": ["cherry-pick"]
22
+ },
23
+ {
24
+ "title": "</details>\n\n## NeMo Tools\n\n<details><summary>Changelog</summary>",
25
+ "labels": ["tools"],
26
+ "exclude_labels": ["cherry-pick"]
27
+ },
28
+ {
29
+ "title": "</details>\n\n## Export\n\n<details><summary>Changelog</summary>",
30
+ "labels": ["export"],
31
+ "exclude_labels": ["cherry-pick"]
32
+ },
33
+ {
34
+ "title": "</details>\n\n## Documentation\n\n<details><summary>Changelog</summary>",
35
+ "labels": ["docs"],
36
+ "exclude_labels": ["cherry-pick"]
37
+ },
38
+ {
39
+ "title": "</details>\n\n## Bugfixes\n\n<details><summary>Changelog</summary>",
40
+ "labels": ["bug"],
41
+ "exclude_labels": ["cherry-pick"]
42
+ },
43
+ {
44
+ "title": "</details>\n\n## Cherrypick\n\n<details><summary>Changelog</summary>",
45
+ "labels": ["cherry-pick"],
46
+ "exclude_labels": ["cherry-pick"]
47
+ }
48
+ ],
49
+ "ignore_labels": [
50
+ "ignore"
51
+ ],
52
+ "sort": "ASC",
53
+ "template": "\n${{CHANGELOG}}</details>\n\n## Uncategorized:\n\n<details><summary>Changelog</summary>\n\n${{UNCATEGORIZED}}\n</details>\n",
54
+ "pr_template": "- ${{TITLE}} by @${{AUTHOR}} :: PR: #${{NUMBER}}",
55
+ "empty_template": "${{OWNER}}\n${{REPO}}\n${{FROM_TAG}}\n${{TO_TAG}}",
56
+ "label_extractor": [
57
+ {
58
+ "pattern": "(.*tts.*)|(.*g2p.*)",
59
+ "target": "tts",
60
+ "flags": "gimu",
61
+ "on_property": ["title", "body"]
62
+ },
63
+ {
64
+ "pattern": "(.*asr.*)|(.*ctc.*)|(.*rnnt.*)|(.*transducer.*)|(.*dali.*)|(.*k2.*)",
65
+ "target": "asr",
66
+ "flags": "gimu",
67
+ "on_property": ["title", "body"]
68
+ },
69
+ {
70
+ "pattern": "(.*nlp.*)|(.*punctuation.*)|(.*capitalization.*)|(.*entity.*)|(.*glue.*)|(.*entity.*)|(.*retrieval.*)|(.*entity.*)|(.*intent.*)|(.*slot.*)|(.*entity.*)|(.*language.*)|(.*qa.*)|(.*token class.*)|(.*text class.*)",
71
+ "target": "nlp",
72
+ "flags": "gimu",
73
+ "on_property": ["title", "body"]
74
+ },
75
+ {
76
+ "pattern": "(.*nmt.*)|(.*bignlp.*)|(.*megatron.*)|(.*machine.*)|(.*translation.*)|(.*gpt.*)",
77
+ "target": "nmt",
78
+ "flags": "gimu",
79
+ "on_property": ["title", "body"]
80
+ },
81
+ {
82
+ "pattern": "(.*tn.*)|(.*itn.*)|(.*text norm.*)",
83
+ "target": "tn",
84
+ "flags": "gimu",
85
+ "on_property": ["title", "body"]
86
+ },
87
+ {
88
+ "pattern": "(.*sde.*)|(.*ctc segment.*)",
89
+ "target": "tools",
90
+ "flags": "gimu",
91
+ "on_property": ["title", "body"]
92
+ },
93
+ {
94
+ "pattern": "(.*trt.*)|(.*onnx.*)|(.*export.*)",
95
+ "target": "export",
96
+ "flags": "gimu",
97
+ "on_property": ["title", "body"]
98
+ },
99
+ {
100
+ "pattern": "(.*\\[x\\] Documentation.*)",
101
+ "target": "docs",
102
+ "flags": "gmu",
103
+ "on_property": ["title", "body"]
104
+ },
105
+ {
106
+ "pattern": "(.*\\[x\\] Bugfix.*)|(.*patch.*)",
107
+ "target": "bug",
108
+ "flags": "gmu",
109
+ "on_property": ["title", "body"]
110
+ },
111
+ {
112
+ "pattern": "(.*cherry-pick.*)|(.*cherrypick.*)",
113
+ "target": "cherrypick",
114
+ "flags": "gimu",
115
+ "on_property": ["title", "body"]
116
+ }
117
+ ],
118
+ "duplicate_filter": {
119
+ "pattern": ".+",
120
+ "on_property": "title",
121
+ "method": "match"
122
+ },
123
+ "transformers": [
124
+ ],
125
+ "max_tags_to_fetch": 100,
126
+ "max_pull_requests": 500,
127
+ "max_back_track_time_days": 365,
128
+ "exclude_merge_branches": [
129
+ ],
130
+ "tag_resolver": {
131
+ "method": "semver"
132
+ }
133
+ }
134
+
.github/workflows/config/codeql.yml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ name: "CodeQL config"
2
+
3
+ paths:
4
+ - nemo/
5
+ - tests/
6
+ - tools/
7
+ - scripts/
8
+ - examples/
9
+ - .github/
.github/workflows/copyright-check.yml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ name: Copyright check
16
+
17
+ on:
18
+ pull_request:
19
+
20
+ jobs:
21
+ copyright-check:
22
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_copyright_check.yml@v0.2.0
.github/workflows/gh-docs.yml ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: gh-docs-build
2
+ on:
3
+ push:
4
+ pull_request:
5
+ paths:
6
+ - "**"
7
+
8
+ # Set the access for individual scopes
9
+ permissions: write-all
10
+
11
+ env:
12
+ PYTHON_VERSION: "3.11"
13
+
14
+ jobs:
15
+ deploy:
16
+ runs-on: ubuntu-latest
17
+
18
+ container:
19
+ image: squidfunk/mkdocs-material
20
+
21
+ steps:
22
+ - uses: actions/checkout@v4
23
+ if: github.event.repository.fork == false
24
+ with:
25
+ ref: gh-pages-src
26
+
27
+ - name: "Correct github config"
28
+ if: github.event.repository.fork == false
29
+ run: |
30
+ git config --global --add safe.directory "$GITHUB_WORKSPACE"
31
+ git config --global user.name "${GITHUB_ACTOR}"
32
+ git config --global user.email "${GITHUB_ACTOR}@users.noreply.${GITHUB_DOMAIN:-"github.com"}"
33
+ remote_repo="https://x-access-token:${GITHUB_TOKEN}@${GITHUB_DOMAIN:-"github.com"}/${GITHUB_REPOSITORY}.git"
34
+ echo "${remote_repo}"
35
+ git remote rm origin
36
+ git remote add origin "${remote_repo}"
37
+
38
+ - name: "Deploy Github Page"
39
+ continue-on-error: true
40
+ run: mkdocs gh-deploy --force
41
+
42
+ linkcheck:
43
+ runs-on: ubuntu-latest
44
+ steps:
45
+ - name: Checkout
46
+ uses: actions/checkout@v4
47
+
48
+ - name: Get changed files
49
+ id: changed-files
50
+ uses: step-security/changed-files@v45.0.1
51
+ with:
52
+ files: docs/**
53
+ files_separator: ","
54
+ separator: " "
55
+
56
+ - name: Set up Python ${{ env.PYTHON_VERSION }}
57
+ if: steps.changed-files.outputs.any_changed == 'true'
58
+ uses: actions/setup-python@v5
59
+ with:
60
+ python-version: ${{ env.PYTHON_VERSION }}
61
+
62
+ - name: Install Sphinx dependencies
63
+ if: steps.changed-files.outputs.any_changed == 'true'
64
+ run: python3 -m pip install -r requirements/requirements_docs.txt
65
+
66
+ - name: Linkcheck docs build
67
+ if: steps.changed-files.outputs.any_changed == 'true'
68
+ run: make -C docs linkcheck || true
69
+
70
+ - name: Eliminate false positives
71
+ if: steps.changed-files.outputs.any_changed == 'true'
72
+ run: ./docs/check_for_broken_links.sh
73
+
74
+ - name: Upload linkcheck output
75
+ if: steps.changed-files.outputs.any_changed == 'true'
76
+ uses: actions/upload-artifact@v4
77
+ with:
78
+ name: linkcheck-artifact
79
+ path: docs/build/linkcheck
80
+ if-no-files-found: error
81
+ retention-days: 7
.github/workflows/install-test.yml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI-Install-Check
2
+
3
+ on:
4
+ pull_request:
5
+ paths:
6
+ - "**"
7
+
8
+ concurrency:
9
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
10
+ cancel-in-progress: true
11
+
12
+ jobs:
13
+ test-installs-macos:
14
+ name: ${{ matrix.os }}-py${{ matrix.python }}-${{ matrix.installer }}
15
+ runs-on: ${{ matrix.os }}
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ os: [macos-latest]
20
+ python: ["3.10", "3.11", "3.12"]
21
+ installer: ["pip-install", "nemo-install"]
22
+ steps:
23
+ - name: Checkout repo
24
+ uses: actions/checkout@v2
25
+
26
+ - uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "${{ matrix.python }}"
29
+
30
+ - name: Install NeMo
31
+ env:
32
+ INSTALLER: ${{ matrix.installer }}
33
+ NEMO_TAG: ${{ github.sha }}
34
+ NEMO_REPO: ${{ github.server_url }}/${{ github.repository }}
35
+ run: |
36
+ if [[ "$INSTALLER" == "pip-install" ]]; then
37
+ pip install --no-cache-dir -U pip
38
+ pip install --no-cache-dir ".[all]"
39
+ else
40
+ export NEMO_TAG
41
+ export NEMO_REPO
42
+ export INSTALL_DIR=$(pwd)
43
+
44
+ bash docker/common/install_dep.sh --library all --mode install
45
+ pip install --no-cache-dir ".[all]"
46
+ fi
47
+
48
+ - name: Run import checks
49
+ run: |
50
+ # Run import checks
51
+ for collection in "asr" "tts" "nlp"; do
52
+ python tests/core_ptl/check_imports.py --domain "$collection"
53
+ done
54
+
55
+ test-installs-linux-amd:
56
+ name: ubuntu-22.04-amd-py${{ matrix.python }}-${{ matrix.installer }}
57
+ runs-on: ubuntu-22.04
58
+ strategy:
59
+ fail-fast: false
60
+ matrix:
61
+ python: ["3.10", "3.11", "3.12"]
62
+ installer: ["pip-install", "nemo-install"]
63
+ steps:
64
+ - name: Checkout repo
65
+ uses: actions/checkout@v2
66
+
67
+ - name: Install Python
68
+ uses: actions/setup-python@v5
69
+ with:
70
+ python-version: ${{ matrix.python }}
71
+
72
+ - name: Install NeMo
73
+ env:
74
+ INSTALLER: ${{ matrix.installer }}
75
+ run: |
76
+ if [ "$INSTALLER" = "pip-install" ]; then
77
+ pip install --upgrade pip
78
+ pip install ".[all]"
79
+ else
80
+ export INSTALL_DIR=$(pwd)
81
+ bash docker/common/install_dep.sh --library all --mode install
82
+ pip install --no-cache-dir ".[all]"
83
+ fi
84
+
85
+ - name: Run import checks
86
+ run: |
87
+ # Run import checks
88
+ for collection in "asr" "tts" "nlp"; do
89
+ python tests/core_ptl/check_imports.py --domain "$collection"
90
+ done
91
+
92
+ test-installs-linux-arm:
93
+ name: ubuntu-22.04-arm-py${{ matrix.python }}-${{ matrix.installer }}
94
+ runs-on: ubuntu-22.04-arm
95
+ strategy:
96
+ fail-fast: false
97
+ matrix:
98
+ python: ["3.10", "3.11", "3.12"]
99
+ installer: ["pip-install", "nemo-install"]
100
+ steps:
101
+ - name: Checkout repo
102
+ uses: actions/checkout@v2
103
+
104
+ - name: Install Python
105
+ uses: actions/setup-python@v5
106
+ with:
107
+ python-version: ${{ matrix.python }}
108
+
109
+ - name: Install NeMo
110
+ env:
111
+ INSTALLER: ${{ matrix.installer }}
112
+ run: |
113
+ if [ "$INSTALLER" = "pip-install" ]; then
114
+ pip install --upgrade pip
115
+ pip install -vvv ".[all]"
116
+ else
117
+ export INSTALL_DIR=$(pwd)
118
+ bash docker/common/install_dep.sh --library all --mode install
119
+ pip install --no-cache-dir ".[all]"
120
+ fi
121
+
122
+ - name: Run import checks
123
+ run: |
124
+ # Run import checks
125
+ for collection in "asr" "tts" "nlp"; do
126
+ python tests/core_ptl/check_imports.py --domain "$collection"
127
+ done
.github/workflows/labeler.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Pull Request Labeler"
2
+ on:
3
+ - pull_request_target
4
+
5
+ jobs:
6
+ triage:
7
+ permissions:
8
+ contents: read
9
+ pull-requests: write
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/labeler@v4
13
+ with:
14
+ repo-token: "${{ secrets.GITHUB_TOKEN }}"
.github/workflows/mcore-tag-bump-bot.yml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Regularly updates the CI container
2
+ name: Megatron Tag Bump Bot
3
+ on:
4
+ workflow_dispatch:
5
+ schedule:
6
+ - cron: 0 0 * * *
7
+
8
+ jobs:
9
+ get-release-branch-names:
10
+ runs-on: ubuntu-latest
11
+ outputs:
12
+ mcore: ${{ steps.get-branch.outputs.mcore_release_branch }}
13
+ nemo: ${{ steps.get-branch.outputs.nemo_release_branch }}
14
+ steps:
15
+ - name: Get release branch names
16
+ id: get-branch
17
+ run: |
18
+ latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/Megatron-LM.git 'refs/heads/core_r*' |
19
+ grep -o 'core_r[0-9]\+\.[0-9]\+\.[0-9]\+' |
20
+ sort -V |
21
+ tail -n1)
22
+ echo "mcore_release_branch=$latest_branch" >> $GITHUB_OUTPUT
23
+
24
+ latest_branch=$(git ls-remote --heads https://github.com/NVIDIA/NeMo.git 'refs/heads/r*' |
25
+ grep -o 'r[0-9]\+\.[0-9]\+\.[0-9]\+' |
26
+ sort -V |
27
+ tail -n1)
28
+ echo "nemo_release_branch=$latest_branch" >> $GITHUB_OUTPUT
29
+
30
+ bump-tags:
31
+ needs: [get-release-branch-names]
32
+ strategy:
33
+ fail-fast: false
34
+ matrix:
35
+ include:
36
+ - nemo-target-branch: ${{ needs.get-release-branch-names.outputs.nemo }}
37
+ mcore-target-branch: ${{ needs.get-release-branch-names.outputs.mcore }}
38
+ - nemo-target-branch: main
39
+ mcore-target-branch: main
40
+ uses: ./.github/workflows/_bump_mcore_tag.yml
41
+ with:
42
+ nemo-target-branch: ${{ matrix.nemo-target-branch }}
43
+ mcore-target-branch: ${{ matrix.mcore-target-branch }}
44
+ secrets:
45
+ PAT: ${{ secrets.PAT }}
46
+
47
+ notify:
48
+ if: failure()
49
+ runs-on: ubuntu-latest
50
+ needs: [bump-tags]
51
+ steps:
52
+ - name: Notify
53
+ env:
54
+ SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
55
+ SLACK_WEBHOOK_ADMIN: <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>
56
+ GITHUB_RUN_ID: ${{ github.run_id }}
57
+ GITHUB_REPOSITORY: ${{ github.repository }}
58
+ run: |
59
+ curl -X POST \
60
+ -H 'Content-type: application/json' \
61
+ --data "{\"text\":\":robot_joy: <https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}|Mcore-bump-bot workflow> failed. Please fix manually.\n\ncc ${SLACK_WEBHOOK_ADMIN}\"}" \
62
+ $SLACK_WEBHOOK
.github/workflows/monitor-single-vm.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: ~shut down a single VM
2
+
3
+ on:
4
+ workflow_call:
5
+ inputs:
6
+ vm:
7
+ type: string
8
+ description: Name of VM
9
+ required: true
10
+ n_gpus:
11
+ type: string
12
+ description: Number of GPUs this VM has
13
+ required: true
14
+
15
+ jobs:
16
+ check-status-and-maybe-shutdown:
17
+ environment: main
18
+ runs-on: ${{ inputs.vm }}
19
+ outputs:
20
+ status: ${{ steps.status.outputs.main }}
21
+ steps:
22
+ - name: Check status
23
+ id: status
24
+ run: |
25
+ docker run --rm --runtime=nvidia --gpus ${{ inputs.n_gpus }} ubuntu nvidia-smi
26
+
27
+ NUM_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
28
+
29
+ if [[ $NUM_GPUS -ne ${{ inputs.n_gpus }} ]]; then
30
+ echo "Issues with GPU detected, will take this runner offline."
31
+ echo "main=degraded" >> "$GITHUB_OUTPUT"
32
+ else
33
+ echo "main=healthy" >> "$GITHUB_OUTPUT"
34
+ fi
35
+
36
+ - name: Send Slack message & Disconnect runner from GitHub
37
+ if: ${{ steps.status.outputs.main == 'degraded' || failure() }}
38
+ run: |
39
+ MESSAGE='{
40
+ "blocks": [
41
+ {
42
+ "type": "section",
43
+ "text": {
44
+ "type": "mrkdwn",
45
+ "text": ":alert: VM bot 🤖: Hey <!subteam^${{ secrets.SLACK_WEBHOOK_ADMIN }}>: VM `${{ inputs.vm }}` is having not the best day of their life, maybe bring them an apple or so."
46
+ }
47
+ }
48
+ ]
49
+ }'
50
+
51
+ curl -X POST -H "Content-type: application/json" --data "$MESSAGE" ${{ secrets.SLACK_WEBHOOK }}
52
+
53
+ cd /home/azureuser/actions-runner
54
+ echo ${{ secrets.VM_KEY }} | sudo -S ./svc.sh stop
.github/workflows/monitor-vms.yml ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Regularly updates the CI container
2
+ name: Reboots VMs in a controlled way
3
+ on:
4
+ schedule:
5
+ - cron: 0/15 * * * *
6
+ workflow_dispatch:
7
+
8
+ jobs:
9
+ pre-flight:
10
+ runs-on: ubuntu-latest
11
+ if: github.repository_owner == 'NVIDIA'
12
+ outputs:
13
+ list-of-vms: ${{ steps.main.outputs.main }}
14
+ environment: main
15
+ steps:
16
+ - name: Get list of VMs
17
+ id: main
18
+ env:
19
+ GITHUB_TOKEN: ${{ secrets.PAT }}
20
+ run: |
21
+ RUNNERS=$(curl -L \
22
+ -H "Accept: application/vnd.github+json" \
23
+ -H "Authorization: Bearer $GITHUB_TOKEN" \
24
+ -H "X-GitHub-Api-Version: 2022-11-28" \
25
+ https://api.github.com/repos/NVIDIA/NeMo/actions/runners)
26
+
27
+ MATRIX=$(echo $RUNNERS \
28
+ | jq -c '[
29
+ .runners[]
30
+ | select(.status == "online")
31
+ | select(.name | contains("cpu") | not)
32
+ | {
33
+ "vm": .name,
34
+ "n_gpus": [
35
+ .labels[]
36
+ | select(.name | endswith("gpu")) | .name
37
+ ][0][:1]
38
+ }
39
+ ]
40
+ '
41
+ )
42
+ echo main=$MATRIX | tee -a "$GITHUB_OUTPUT"
43
+
44
+ maintenance:
45
+ needs: pre-flight
46
+ strategy:
47
+ fail-fast: false
48
+ matrix:
49
+ include: ${{ fromJSON(needs.pre-flight.outputs.list-of-vms )}}
50
+ uses: ./.github/workflows/monitor-single-vm.yml
51
+ with:
52
+ vm: ${{ matrix.vm }}
53
+ n_gpus: ${{ matrix.n_gpus }}
54
+ secrets: inherit # pragma: allowlist secret
.github/workflows/release-freeze.yml ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Code freeze"
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ type_of_release:
7
+ type: choice
8
+ description: Type of release
9
+ options:
10
+ - major
11
+ - minor
12
+ freeze-commit:
13
+ type: string
14
+ description: Commit SHA to use for cut-off
15
+ required: false
16
+ default: main
17
+ mcore_version:
18
+ description: "Version of MCore to use (must be a valid git ref)"
19
+ required: true
20
+ type: string
21
+ dry-run:
22
+ type: boolean
23
+ description: Dry-run of code-freeze
24
+ required: false
25
+ default: true
26
+
27
+ jobs:
28
+ code-freeze:
29
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_code_freeze.yml@v0.25.2
30
+ with:
31
+ library-name: NeMo-Toolkit
32
+ python-package: nemo
33
+ release-type: ${{ inputs.type_of_release }}
34
+ freeze-commit: ${{ inputs.freeze-commit }}
35
+ dry-run: ${{ inputs.dry-run }}
36
+ use-pat: true
37
+ secrets:
38
+ SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
39
+ SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
40
+ PAT: ${{ secrets.PAT }}
41
+
42
+ freeze-tags:
43
+ runs-on: ubuntu-latest
44
+ needs: [code-freeze]
45
+ environment: main
46
+ steps:
47
+ - name: Checkout repository
48
+ uses: actions/checkout@v4
49
+ with:
50
+ path: ${{ github.run_id }}
51
+ token: ${{ secrets.PAT }}
52
+ fetch-depth: 0
53
+ fetch-tags: true
54
+ ref: ${{ inputs.dry-run == true && inputs.freeze-commit || needs.code-freeze.outputs.release-branch }}
55
+
56
+ - name: Pin branch name in Notebooks
57
+ run: |
58
+ cd ${{ github.run_id }}
59
+ find tutorials -type f -name "*.ipynb" -exec sed -i "s/BRANCH = 'main'/BRANCH = '${{ needs.code-freeze.outputs.release-branch }}'/g" {} +
60
+
61
+ - name: Pin MCore in Dockerfile
62
+ run: |
63
+ cd ${{ github.run_id }}
64
+ sed -i 's/^ARG MCORE_TAG=.*$/ARG MCORE_TAG=${{ inputs.mcore_version }}/' docker/Dockerfile.ci
65
+
66
+ - name: Show status
67
+ run: |
68
+ cd ${{ github.run_id }}
69
+ git status
70
+
71
+ - name: Create PR
72
+ uses: peter-evans/create-pull-request@v6
73
+ id: create-pull-request
74
+ if: ${{ inputs.dry-run != true }}
75
+ with:
76
+ path: ${{ github.run_id }}
77
+ base: ${{ needs.code-freeze.outputs.release-branch }}
78
+ branch: ci/freeze-tags-${{ needs.code-freeze.outputs.release-branch }}
79
+ title: "Freeze tags in in `${{ needs.code-freeze.outputs.release-branch }}`"
80
+ body: |
81
+ 🚀 PR to freeze tags in `${{ needs.code-freeze.outputs.release-branch }}`.
82
+
83
+ commit-message: "[🤠]: Howdy folks, let's release NeMo `${{ needs.code-freeze.outputs.release-branch }}` !"
84
+ signoff: true
85
+ assignees: okoenig
.github/workflows/release.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: "Release Neural Modules"
15
+
16
+ on:
17
+ workflow_dispatch:
18
+ inputs:
19
+ release-ref:
20
+ description: Ref (SHA or branch name) to release
21
+ required: true
22
+ type: string
23
+ version-bump-branch:
24
+ description: Branch for version bump
25
+ required: true
26
+ type: string
27
+ dry-run:
28
+ description: Do not publish a wheel and GitHub release.
29
+ required: true
30
+ default: true
31
+ type: boolean
32
+
33
+ jobs:
34
+ release:
35
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_release_library.yml@v0.40.0
36
+ with:
37
+ release-ref: ${{ inputs.release-ref }}
38
+ python-package: nemo
39
+ python-version: "3.10"
40
+ library-name: Neural Modules
41
+ dry-run: ${{ inputs.dry-run }}
42
+ version-bump-branch: ${{ inputs.version-bump-branch }}
43
+ secrets:
44
+ TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
45
+ TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
46
+ SLACK_WEBHOOK_ADMIN: ${{ secrets.SLACK_WEBHOOK_ADMIN }}
47
+ SLACK_WEBHOOK: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
48
+ PAT: ${{ secrets.PAT }}
.github/workflows/secrets-detector.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: Secrets detector
15
+
16
+ on:
17
+ pull_request_target:
18
+ branches:
19
+ - 'main'
20
+
21
+ jobs:
22
+ main:
23
+ runs-on: ubuntu-latest
24
+ steps:
25
+ - name: Checkout repository
26
+ uses: actions/checkout@v4
27
+ with:
28
+ fetch-depth: 0
29
+ token: ${{ secrets.NEMO_REFORMAT_TOKEN }}
30
+
31
+ - name: Install secrets detector
32
+ run: pip install detect-secrets
33
+
34
+ - name: Run on change-set
35
+ run: |
36
+ git diff --name-only --diff-filter=d --merge-base origin/main -z | xargs -0 detect-secrets-hook --disable-plugin HexHighEntropyString --baseline .secrets.baseline
37
+
38
+ - uses: EndBug/add-and-commit@v9
39
+ # Commit changes. Nothing is committed if no changes.
40
+ if: always()
41
+ with:
42
+ message: Update baseline
43
+ commit: --signoff
.github/workflows/update-buildcache.yml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2020-2021, NVIDIA CORPORATION.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ name: Update build cache
15
+ on:
16
+ schedule:
17
+ - cron: 0 0 * * *
18
+ push:
19
+ branches:
20
+ - main
21
+ workflow_dispatch:
22
+ inputs:
23
+ runner:
24
+ required: false
25
+ default: self-hosted-azure-builder
26
+ type: string
27
+ description: VM to use for build
28
+
29
+ jobs:
30
+ pre-flight:
31
+ runs-on: ubuntu-latest
32
+ outputs:
33
+ build_args: ${{ steps.manifest.outputs.BUILD_ARGS }}
34
+ cache-from: ${{ steps.cache_from.outputs.LAST_PRS }}
35
+ steps:
36
+ - name: Checkout branch
37
+ uses: actions/checkout@v4
38
+
39
+ - name: Parse manifest.json
40
+ id: manifest
41
+ run: |
42
+ BUILD_ARGS=$(cat << EOF
43
+ BASE_IMAGE=$(cat requirements/manifest.json | jq -r '."ngc-pytorch"')
44
+ TRTLLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".repo')
45
+ TRTLLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."trt-llm".ref')
46
+ MLM_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".repo')
47
+ MLM_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies"."megatron-lm".ref')
48
+ TE_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.repo')
49
+ TE_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".transformer_engine.ref')
50
+ APEX_REPO=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.repo')
51
+ APEX_TAG=$(cat requirements/manifest.json | jq -r '."vcs-dependencies".apex.ref')
52
+ EOF
53
+ )
54
+
55
+ echo "BUILD_ARGS<<EOF" >> $GITHUB_OUTPUT
56
+ echo "$BUILD_ARGS" >> $GITHUB_OUTPUT
57
+ echo "EOF" >> $GITHUB_OUTPUT
58
+
59
+ - name: Get last merged PR
60
+ id: cache_from
61
+ env:
62
+ GH_TOKEN: ${{ github.token }}
63
+ run: |
64
+ LAST_PRS=$(gh api graphql -f query='
65
+ query {
66
+ repository(owner: "NVIDIA", name: "NeMo") {
67
+ pullRequests(states: MERGED, first: 100, orderBy: {field: UPDATED_AT, direction: DESC}) {
68
+ nodes {
69
+ number
70
+ }
71
+ }
72
+ }
73
+ }' | jq -r '.data.repository.pullRequests.nodes[].number' | while read -r number; do
74
+ echo "nemoci.azurecr.io/nemo_container-buildcache:$number"
75
+ done)
76
+
77
+ echo "LAST_PRS<<EOF" >> $GITHUB_OUTPUT
78
+ echo "$LAST_PRS" >> $GITHUB_OUTPUT
79
+ echo "EOF" >> $GITHUB_OUTPUT
80
+
81
+ cicd-test-container-build:
82
+ needs: [pre-flight]
83
+ uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_build_container.yml@v0.27.0
84
+ strategy:
85
+ fail-fast: false
86
+ matrix:
87
+ include:
88
+ - dockerfile: docker/Dockerfile.ci
89
+ image-name: nemo_container_automodel
90
+ - dockerfile: docker/Dockerfile.ci
91
+ image-name: nemo_container_nemo2
92
+ - dockerfile: docker/Dockerfile.ci
93
+ image-name: nemo_container_speech
94
+ - dockerfile: docker/Dockerfile.ci
95
+ image-name: nemo_container
96
+ - dockerfile: docker/Dockerfile.ci.export_deploy
97
+ image-name: nemo_container_export_deploy
98
+ with:
99
+ image-name: ${{ matrix.image-name }}
100
+ dockerfile: ${{ matrix.dockerfile }}
101
+ image-label: nemo-core
102
+ build-args: |
103
+ IMAGE_LABEL=nemo-core
104
+ NEMO_TAG=${{ github.sha }}
105
+ NEMO_REPO=https://github.com/NVIDIA/NeMo
106
+ ${{ needs.pre-flight.outputs.BUILD_ARGS }}
107
+ runner: ${{ inputs.runner || 'self-hosted-azure-builder' }}
108
+ use-inline-cache: false
109
+ prune-filter-timerange: 24h
110
+ cache-from: |
111
+ nemoci.azurecr.io/${{ matrix.image-name }}-buildcache:main
112
+ ${{ needs.pre-flight.outputs.cache-from }}
README.md ADDED
@@ -0,0 +1,544 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [![Project Status: Active -- The project has reached a stable, usable state and is being actively developed.](http://www.repostatus.org/badges/latest/active.svg)](http://www.repostatus.org/#active)
2
+ [![Documentation](https://readthedocs.com/projects/nvidia-nemo/badge/?version=main)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/)
3
+ [![CodeQL](https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&event=push)](https://github.com/nvidia/nemo/actions/workflows/codeql.yml)
4
+ [![NeMo core license and license for collections in this repo](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://github.com/NVIDIA/NeMo/blob/master/LICENSE)
5
+ [![Release version](https://badge.fury.io/py/nemo-toolkit.svg)](https://badge.fury.io/py/nemo-toolkit)
6
+ [![Python version](https://img.shields.io/pypi/pyversions/nemo-toolkit.svg)](https://badge.fury.io/py/nemo-toolkit)
7
+ [![PyPi total downloads](https://static.pepy.tech/personalized-badge/nemo-toolkit?period=total&units=international_system&left_color=grey&right_color=brightgreen&left_text=downloads)](https://pepy.tech/project/nemo-toolkit)
8
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
9
+
10
+ # **NVIDIA NeMo Framework**
11
+
12
+ ## Latest News
13
+
14
+ <!-- markdownlint-disable -->
15
+ <details open>
16
+ <summary><b>Pretrain and finetune :hugs:Hugging Face models via AutoModel</b></summary>
17
+ Nemo Framework's latest feature AutoModel enables broad support for :hugs:Hugging Face models, with 25.04 focusing on
18
+
19
+
20
+ - <a href=https://huggingface.co/transformers/v3.5.1/model_doc/auto.html#automodelforcausallm>AutoModelForCausalLM<a> in the <a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending">Text Generation<a> category
21
+ - <a href=https://huggingface.co/docs/transformers/main/model_doc/auto#transformers.AutoModelForImageTextToText>AutoModelForImageTextToText<a> in the <a href="https://huggingface.co/models?pipeline_tag=image-text-to-text&sort=trending">Image-Text-to-Text<a> category
22
+
23
+ More Details in Blog: <a href=https://developer.nvidia.com/blog/run-hugging-face-models-instantly-with-day-0-support-from-nvidia-nemo-framework>Run Hugging Face Models Instantly with Day-0 Support from NVIDIA NeMo Framework<a>. Future releases will enable support for more model families such as Video Generation models.(2025-05-19)
24
+ </details>
25
+
26
+ <details open>
27
+ <summary><b>Training on Blackwell using Nemo</b></summary>
28
+ NeMo Framework has added Blackwell support, with <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html>performance benchmarks on GB200 & B200<a>. More optimizations to come in the upcoming releases.(2025-05-19)
29
+ </details>
30
+
31
+ <details open>
32
+ <summary><b>Training Performance on GPU Tuning Guide</b></summary>
33
+ NeMo Framework has published <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance-guide.html>a comprehensive guide for performance tuning to achieve optimal throughput<a>! (2025-05-19)
34
+ </details>
35
+
36
+ <details open>
37
+ <summary><b>New Models Support</b></summary>
38
+ NeMo Framework has added support for latest community models - <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/vlms/llama4.html>Llama 4<a>, <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/vision/diffusionmodels/flux.html>Flux<a>, <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama_nemotron.html>Llama Nemotron<a>, <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/hyena.html#>Hyena & Evo2<a>, <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/vlms/qwen2vl.html>Qwen2-VL<a>, <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/qwen2.html>Qwen2.5<a>, Gemma3, Qwen3-30B&32B.(2025-05-19)
39
+ </details>
40
+
41
+
42
+ <details open>
43
+ <summary><b>NeMo Framework 2.0</b></summary>
44
+ We've released NeMo 2.0, an update on the NeMo Framework which prioritizes modularity and ease-of-use. Please refer to the <a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html>NeMo Framework User Guide</a> to get started.
45
+ </details>
46
+ <details open>
47
+ <summary><b>New Cosmos World Foundation Models Support</b></summary>
48
+ <details>
49
+ <summary> <a href="https://developer.nvidia.com/blog/advancing-physical-ai-with-nvidia-cosmos-world-foundation-model-platform">Advancing Physical AI with NVIDIA Cosmos World Foundation Model Platform </a> (2025-01-09)
50
+ </summary>
51
+ The end-to-end NVIDIA Cosmos platform accelerates world model development for physical AI systems. Built on CUDA, Cosmos combines state-of-the-art world foundation models, video tokenizers, and AI-accelerated data processing pipelines. Developers can accelerate world model development by fine-tuning Cosmos world foundation models or building new ones from the ground up. These models create realistic synthetic videos of environments and interactions, providing a scalable foundation for training complex systems, from simulating humanoid robots performing advanced actions to developing end-to-end autonomous driving models.
52
+ <br><br>
53
+ </details>
54
+ <details>
55
+ <summary>
56
+ <a href="https://developer.nvidia.com/blog/accelerate-custom-video-foundation-model-pipelines-with-new-nvidia-nemo-framework-capabilities/">
57
+ Accelerate Custom Video Foundation Model Pipelines with New NVIDIA NeMo Framework Capabilities
58
+ </a> (2025-01-07)
59
+ </summary>
60
+ The NeMo Framework now supports training and customizing the <a href="https://github.com/NVIDIA/Cosmos">NVIDIA Cosmos</a> collection of world foundation models. Cosmos leverages advanced text-to-world generation techniques to create fluid, coherent video content from natural language prompts.
61
+ <br><br>
62
+ You can also now accelerate your video processing step using the <a href="https://developer.nvidia.com/nemo-curator-video-processing-early-access">NeMo Curator</a> library, which provides optimized video processing and captioning features that can deliver up to 89x faster video processing when compared to an unoptimized CPU pipeline.
63
+ <br><br>
64
+ </details>
65
+ </details>
66
+ <details open>
67
+ <summary><b>Large Language Models and Multimodal Models</b></summary>
68
+ <details>
69
+ <summary>
70
+ <a href="https://developer.nvidia.com/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/">
71
+ State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo
72
+ </a> (2024-11-06)
73
+ </summary>
74
+ NVIDIA recently announced significant enhancements to the NeMo platform, focusing on multimodal generative AI models. The update includes NeMo Curator and the Cosmos tokenizer, which streamline the data curation process and enhance the quality of visual data. These tools are designed to handle large-scale data efficiently, making it easier to develop high-quality AI models for various applications, including robotics and autonomous driving. The Cosmos tokenizers, in particular, efficiently map visual data into compact, semantic tokens, which is crucial for training large-scale generative models. The tokenizer is available now on the <a href=http://github.com/NVIDIA/cosmos-tokenizer/NVIDIA/cosmos-tokenizer>NVIDIA/cosmos-tokenizer</a> GitHub repo and on <a href=https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8>Hugging Face</a>.
75
+ <br><br>
76
+ </details>
77
+ <details>
78
+ <summary>
79
+ <a href="https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama/index.html#new-llama-3-1-support for more information/">
80
+ New Llama 3.1 Support
81
+ </a> (2024-07-23)
82
+ </summary>
83
+ The NeMo Framework now supports training and customizing the Llama 3.1 collection of LLMs from Meta.
84
+ <br><br>
85
+ </details>
86
+ <details>
87
+ <summary>
88
+ <a href="https://aws.amazon.com/blogs/machine-learning/accelerate-your-generative-ai-distributed-training-workloads-with-the-nvidia-nemo-framework-on-amazon-eks/">
89
+ Accelerate your Generative AI Distributed Training Workloads with the NVIDIA NeMo Framework on Amazon EKS
90
+ </a> (2024-07-16)
91
+ </summary>
92
+ NVIDIA NeMo Framework now runs distributed training workloads on an Amazon Elastic Kubernetes Service (Amazon EKS) cluster. For step-by-step instructions on creating an EKS cluster and running distributed training workloads with NeMo, see the GitHub repository <a href="https://github.com/aws-samples/awsome-distributed-training/tree/main/3.test_cases/2.nemo-launcher/EKS/"> here.</a>
93
+ <br><br>
94
+ </details>
95
+ <details>
96
+ <summary>
97
+ <a href="https://developer.nvidia.com/blog/nvidia-nemo-accelerates-llm-innovation-with-hybrid-state-space-model-support/">
98
+ NVIDIA NeMo Accelerates LLM Innovation with Hybrid State Space Model Support
99
+ </a> (2024/06/17)
100
+ </summary>
101
+ NVIDIA NeMo and Megatron Core now support pre-training and fine-tuning of state space models (SSMs). NeMo also supports training models based on the Griffin architecture as described by Google DeepMind.
102
+ <br><br>
103
+ </details>
104
+ <details>
105
+ <summary>
106
+ <a href="https://huggingface.co/models?sort=trending&search=nvidia%2Fnemotron-4-340B">
107
+ NVIDIA releases 340B base, instruct, and reward models pretrained on a total of 9T tokens.
108
+ </a> (2024-06-18)
109
+ </summary>
110
+ See documentation and tutorials for SFT, PEFT, and PTQ with
111
+ <a href="https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/nemotron/index.html">
112
+ Nemotron 340B
113
+ </a>
114
+ in the NeMo Framework User Guide.
115
+ <br><br>
116
+ </details>
117
+ <details>
118
+ <summary>
119
+ <a href="https://developer.nvidia.com/blog/nvidia-sets-new-generative-ai-performance-and-scale-records-in-mlperf-training-v4-0/">
120
+ NVIDIA sets new generative AI performance and scale records in MLPerf Training v4.0
121
+ </a> (2024/06/12)
122
+ </summary>
123
+ Using NVIDIA NeMo Framework and NVIDIA Hopper GPUs NVIDIA was able to scale to 11,616 H100 GPUs and achieve near-linear performance scaling on LLM pretraining.
124
+ NVIDIA also achieved the highest LLM fine-tuning performance and raised the bar for text-to-image training.
125
+ <br><br>
126
+ </details>
127
+ <details>
128
+ <summary>
129
+ <a href="https://cloud.google.com/blog/products/compute/gke-and-nvidia-nemo-framework-to-train-generative-ai-models">
130
+ Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE
131
+ </a> (2024/03/16)
132
+ </summary>
133
+ An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke.
134
+ The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.
135
+ <br><br>
136
+ </details>
137
+ </details>
138
+ <details open>
139
+ <summary><b>Speech Recognition</b></summary>
140
+ <details>
141
+ <summary>
142
+ <a href="https://developer.nvidia.com/blog/accelerating-leaderboard-topping-asr-models-10x-with-nvidia-nemo/">
143
+ Accelerating Leaderboard-Topping ASR Models 10x with NVIDIA NeMo
144
+ </a> (2024/09/24)
145
+ </summary>
146
+ NVIDIA NeMo team released a number of inference optimizations for CTC, RNN-T, and TDT models that resulted in up to 10x inference speed-up.
147
+ These models now exceed an inverse real-time factor (RTFx) of 2,000, with some reaching RTFx of even 6,000.
148
+ <br><br>
149
+ </details>
150
+ <details>
151
+ <summary>
152
+ <a href="https://developer.nvidia.com/blog/new-standard-for-speech-recognition-and-translation-from-the-nvidia-nemo-canary-model/">
153
+ New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model
154
+ </a> (2024/04/18)
155
+ </summary>
156
+ The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization.
157
+ Canary also provides bi-directional translation, between English and the three other supported languages.
158
+ <br><br>
159
+ </details>
160
+ <details>
161
+ <summary>
162
+ <a href="https://developer.nvidia.com/blog/pushing-the-boundaries-of-speech-recognition-with-nemo-parakeet-asr-models/">
163
+ Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models
164
+ </a> (2024/04/18)
165
+ </summary>
166
+ NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models.
167
+ These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.
168
+ <br><br>
169
+ </details>
170
+ <details>
171
+ <summary>
172
+ <a href="https://developer.nvidia.com/blog/turbocharge-asr-accuracy-and-speed-with-nvidia-nemo-parakeet-tdt/">
173
+ Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT
174
+ </a> (2024/04/18)
175
+ </summary>
176
+ NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT.
177
+ This new addition to the  NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed over the previously best model, Parakeet-RNNT-1.1B.
178
+ <br><br>
179
+ </details>
180
+ </details>
181
+ <!-- markdownlint-enable -->
182
+
183
+ ## Introduction
184
+
185
+ NVIDIA NeMo Framework is a scalable and cloud-native generative AI
186
+ framework built for researchers and PyTorch developers working on Large
187
+ Language Models (LLMs), Multimodal Models (MMs), Automatic Speech
188
+ Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV)
189
+ domains. It is designed to help you efficiently create, customize, and
190
+ deploy new generative AI models by leveraging existing code and
191
+ pre-trained model checkpoints.
192
+
193
+ For technical documentation, please see the [NeMo Framework User
194
+ Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html).
195
+
196
+ ## What's New in NeMo 2.0
197
+
198
+ NVIDIA NeMo 2.0 introduces several significant improvements over its predecessor, NeMo 1.0, enhancing flexibility, performance, and scalability.
199
+
200
+ - **Python-Based Configuration** - NeMo 2.0 transitions from YAML files to a Python-based configuration, providing more flexibility and control. This shift makes it easier to extend and customize configurations programmatically.
201
+
202
+ - **Modular Abstractions** - By adopting PyTorch Lightning’s modular abstractions, NeMo 2.0 simplifies adaptation and experimentation. This modular approach allows developers to more easily modify and experiment with different components of their models.
203
+
204
+ - **Scalability** - NeMo 2.0 seamlessly scales large-scale experiments across thousands of GPUs using [NeMo-Run](https://github.com/NVIDIA/NeMo-Run), a powerful tool designed to streamline the configuration, execution, and management of machine learning experiments across computing environments.
205
+
206
+ Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendly framework for AI model development.
207
+
208
+ > [!IMPORTANT]
209
+ > NeMo 2.0 is currently supported by the LLM (large language model) and VLM (vision language model) collections.
210
+
211
+ ### Get Started with NeMo 2.0
212
+
213
+ - Refer to the [Quickstart](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/quickstart.html) for examples of using NeMo-Run to launch NeMo 2.0 experiments locally and on a slurm cluster.
214
+ - For more information about NeMo 2.0, see the [NeMo Framework User Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html).
215
+ - [NeMo 2.0 Recipes](https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/recipes) contains additional examples of launching large-scale runs using NeMo 2.0 and NeMo-Run.
216
+ - For an in-depth exploration of the main features of NeMo 2.0, see the [Feature Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/index.html#feature-guide).
217
+ - To transition from NeMo 1.0 to 2.0, see the [Migration Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/migration/index.html#migration-guide) for step-by-step instructions.
218
+
219
+ ### Get Started with Cosmos
220
+
221
+ NeMo Curator and NeMo Framework support video curation and post-training of the Cosmos World Foundation Models, which are open and available on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cosmos/collections/cosmos) and [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6). For more information on video datasets, refer to [NeMo Curator](https://developer.nvidia.com/nemo-curator). To post-train World Foundation Models using the NeMo Framework for your custom physical AI tasks, see the [Cosmos Diffusion models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/nemo/post_training/README.md) and the [Cosmos Autoregressive models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/autoregressive/nemo/post_training/README.md).
222
+
223
+ ## LLMs and MMs Training, Alignment, and Customization
224
+
225
+ All NeMo models are trained with
226
+ [Lightning](https://github.com/Lightning-AI/lightning). Training is
227
+ automatically scalable to 1000s of GPUs. You can check the performance benchmarks using the
228
+ latest NeMo Framework container [here](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
229
+
230
+ When applicable, NeMo models leverage cutting-edge distributed training
231
+ techniques, incorporating [parallelism
232
+ strategies](https://docs.nvidia.com/nemo-framework/user-guide/latest/modeloverview.html)
233
+ to enable efficient training of very large models. These techniques
234
+ include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully
235
+ Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed
236
+ Precision Training with BFloat16 and FP8, as well as others.
237
+
238
+ NeMo Transformer-based LLMs and MMs utilize [NVIDIA Transformer
239
+ Engine](https://github.com/NVIDIA/TransformerEngine) for FP8 training on
240
+ NVIDIA Hopper GPUs, while leveraging [NVIDIA Megatron
241
+ Core](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) for
242
+ scaling Transformer model training.
243
+
244
+ NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM,
245
+ Direct Preference Optimization (DPO), and Reinforcement Learning from
246
+ Human Feedback (RLHF). See [NVIDIA NeMo
247
+ Aligner](https://github.com/NVIDIA/NeMo-Aligner) for more information.
248
+
249
+ In addition to supervised fine-tuning (SFT), NeMo also supports the
250
+ latest parameter efficient fine-tuning (PEFT) techniques such as LoRA,
251
+ P-Tuning, Adapters, and IA3. Refer to the [NeMo Framework User
252
+ Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/index.html)
253
+ for the full list of supported models and techniques.
254
+
255
+ ## LLMs and MMs Deployment and Optimization
256
+
257
+ NeMo LLMs and MMs can be deployed and optimized with [NVIDIA NeMo
258
+ Microservices](https://developer.nvidia.com/nemo-microservices-early-access).
259
+
260
+ ## Speech AI
261
+
262
+ NeMo ASR and TTS models can be optimized for inference and deployed for
263
+ production use cases with [NVIDIA Riva](https://developer.nvidia.com/riva).
264
+
265
+ ## NeMo Framework Launcher
266
+
267
+ > [!IMPORTANT]
268
+ > NeMo Framework Launcher is compatible with NeMo version 1.0 only. [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) is recommended for launching experiments using NeMo 2.0.
269
+
270
+ [NeMo Framework
271
+ Launcher](https://github.com/NVIDIA/NeMo-Megatron-Launcher) is a
272
+ cloud-native tool that streamlines the NeMo Framework experience. It is
273
+ used for launching end-to-end NeMo Framework training jobs on CSPs and
274
+ Slurm clusters.
275
+
276
+ The NeMo Framework Launcher includes extensive recipes, scripts,
277
+ utilities, and documentation for training NeMo LLMs. It also includes
278
+ the NeMo Framework [Autoconfigurator](https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration),
279
+ which is designed to find the optimal model parallel configuration for
280
+ training on a specific cluster.
281
+
282
+ To get started quickly with the NeMo Framework Launcher, please see the
283
+ [NeMo Framework
284
+ Playbooks](https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html).
285
+ The NeMo Framework Launcher does not currently support ASR and TTS
286
+ training, but it will soon.
287
+
288
+ ## Get Started with NeMo Framework
289
+
290
+ Getting started with NeMo Framework is easy. State-of-the-art pretrained
291
+ NeMo models are freely available on [Hugging Face
292
+ Hub](https://huggingface.co/models?library=nemo&sort=downloads&search=nvidia)
293
+ and [NVIDIA
294
+ NGC](https://catalog.ngc.nvidia.com/models?query=nemo&orderBy=weightPopularDESC).
295
+ These models can be used to generate text or images, transcribe audio,
296
+ and synthesize speech in just a few lines of code.
297
+
298
+ We have extensive
299
+ [tutorials](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html)
300
+ that can be run on [Google Colab](https://colab.research.google.com) or
301
+ with our [NGC NeMo Framework
302
+ Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo).
303
+ We also have
304
+ [playbooks](https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html)
305
+ for users who want to train NeMo models with the NeMo Framework
306
+ Launcher.
307
+
308
+ For advanced users who want to train NeMo models from scratch or
309
+ fine-tune existing NeMo models, we have a full suite of [example
310
+ scripts](https://github.com/NVIDIA/NeMo/tree/main/examples) that support
311
+ multi-GPU/multi-node training.
312
+
313
+ ## Key Features
314
+
315
+ - [Large Language Models](nemo/collections/nlp/README.md)
316
+ - [Multimodal](nemo/collections/multimodal/README.md)
317
+ - [Automatic Speech Recognition](nemo/collections/asr/README.md)
318
+ - [Text to Speech](nemo/collections/tts/README.md)
319
+ - [Computer Vision](nemo/collections/vision/README.md)
320
+
321
+ ## Requirements
322
+
323
+ - Python 3.10 or above
324
+ - PyTorch 2.5 or above
325
+ - NVIDIA GPU (if you intend to do model training)
326
+
327
+ ## Developer Documentation
328
+
329
+ | Version | Status | Description |
330
+ | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------ |
331
+ | Latest | [![Documentation Status](https://readthedocs.com/projects/nvidia-nemo/badge/?version=main)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/) | [Documentation of the latest (i.e. main) branch.](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/) |
332
+ | Stable | [![Documentation Status](https://readthedocs.com/projects/nvidia-nemo/badge/?version=stable)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/) | [Documentation of the stable (i.e. most recent release)](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/) |
333
+
334
+ ## Install NeMo Framework
335
+
336
+ The NeMo Framework can be installed in a variety of ways, depending on
337
+ your needs. Depending on the domain, you may find one of the following
338
+ installation methods more suitable.
339
+
340
+ - [Conda / Pip](#conda--pip): Install NeMo-Framework with native Pip into a virtual environment.
341
+ - Used to explore NeMo on any supported platform.
342
+ - This is the recommended method for ASR and TTS domains.
343
+ - Limited feature-completeness for other domains.
344
+ - [NGC PyTorch container](#ngc-pytorch-container): Install NeMo-Framework from source with feature-completeness into a highly optimized container.
345
+ - For users that want to install from source in a highly optimized container.
346
+ - [NGC NeMo container](#ngc-nemo-container): Ready-to-go solution of NeMo-Framework
347
+ - For users that seek highest performance.
348
+ - Contains all dependencies installed and tested for performance and convergence.
349
+
350
+ ### Support matrix
351
+
352
+ NeMo-Framework provides tiers of support based on OS / Platform and mode of installation. Please refer the following overview of support levels:
353
+
354
+ - Fully supported: Max performance and feature-completeness.
355
+ - Limited support: Used to explore NeMo.
356
+ - No support yet: In development.
357
+ - Deprecated: Support has reached end of life.
358
+
359
+ Please refer to the following table for current support levels:
360
+
361
+ | OS / Platform | Install from PyPi | Source into NGC container |
362
+ |----------------------------|-------------------|---------------------------|
363
+ | `linux` - `amd64/x86_64`   | Limited support   | Full support              |
364
+ | `linux` - `arm64` | Limited support | Limited support |
365
+ | `darwin` - `amd64/x86_64`  | Deprecated        | Deprecated                |
366
+ | `darwin` - `arm64` | Limited support | Limited support |
367
+ | `windows` - `amd64/x86_64` | No support yet    | No support yet            |
368
+ | `windows` - `arm64` | No support yet | No support yet |
369
+
370
+ ### Conda / Pip
371
+
372
+ Install NeMo in a fresh Conda environment:
373
+
374
+ ```bash
375
+ conda create --name nemo python==3.10.12
376
+ conda activate nemo
377
+ ```
378
+
379
+ #### Pick the right version
380
+
381
+ NeMo-Framework publishes pre-built wheels with each release.
382
+ To install nemo_toolkit from such a wheel, use the following installation method:
383
+
384
+ ```bash
385
+ pip install "nemo_toolkit[all]"
386
+ ```
387
+
388
+ If a more specific version is desired, we recommend a Pip-VCS install. From [NVIDIA/NeMo](https://github.com/NVIDIA/NeMo), fetch the commit, branch, or tag that you would like to install.
389
+ To install nemo_toolkit from this Git reference `$REF`, use the following installation method:
390
+
391
+ ```bash
392
+ git clone https://github.com/NVIDIA/NeMo
393
+ cd NeMo
394
+ git checkout ${REF:-'main'}
395
+ pip install '.[all]'
396
+ ```
397
+
398
+ #### Install a specific Domain
399
+
400
+ To install a specific domain of NeMo, you must first install the
401
+ nemo_toolkit using the instructions listed above. Then, you run the
402
+ following domain-specific commands:
403
+
404
+ ```bash
405
+ pip install nemo_toolkit['all'] # or pip install "nemo_toolkit['all']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
406
+ pip install nemo_toolkit['asr'] # or pip install "nemo_toolkit['asr']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
407
+ pip install nemo_toolkit['nlp'] # or pip install "nemo_toolkit['nlp']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
408
+ pip install nemo_toolkit['tts'] # or pip install "nemo_toolkit['tts']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
409
+ pip install nemo_toolkit['vision'] # or pip install "nemo_toolkit['vision']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
410
+ pip install nemo_toolkit['multimodal'] # or pip install "nemo_toolkit['multimodal']@git+https://github.com/NVIDIA/NeMo@${REF:-'main'}"
411
+ ```
412
+
413
+ ### NGC PyTorch container
414
+
415
+ **NOTE: The following steps are supported beginning with 24.04 (NeMo-Toolkit 2.3.0)**
416
+
417
+ We recommend that you start with a base NVIDIA PyTorch container:
418
+ nvcr.io/nvidia/pytorch:25.01-py3.
419
+
420
+ If starting with a base NVIDIA PyTorch container, you must first launch
421
+ the container:
422
+
423
+ ```bash
424
+ docker run \
425
+ --gpus all \
426
+ -it \
427
+ --rm \
428
+ --shm-size=16g \
429
+ --ulimit memlock=-1 \
430
+ --ulimit stack=67108864 \
431
+ ${NV_PYTORCH_TAG:-'nvcr.io/nvidia/pytorch:25.01-py3'}
432
+ ```
433
+
434
+ From [NVIDIA/NeMo](https://github.com/NVIDIA/NeMo), fetch the commit/branch/tag that you want to install.
435
+ To install nemo_toolkit including all of its dependencies from this Git reference `$REF`, use the following installation method:
436
+
437
+ ```bash
438
+ cd /opt
439
+ git clone https://github.com/NVIDIA/NeMo
440
+ cd NeMo
441
+ git checkout ${REF:-'main'}
442
+ bash docker/common/install_dep.sh --library all
443
+ pip install ".[all]"
444
+ ```
445
+
446
+ ## NGC NeMo container
447
+
448
+ NeMo containers are launched concurrently with NeMo version updates.
449
+ NeMo Framework now supports LLMs, MMs, ASR, and TTS in a single
450
+ consolidated Docker container. You can find additional information about
451
+ released containers on the [NeMo releases
452
+ page](https://github.com/NVIDIA/NeMo/releases).
453
+
454
+ To use a pre-built container, run the following code:
455
+
456
+ ```bash
457
+ docker run \
458
+ --gpus all \
459
+ -it \
460
+ --rm \
461
+ --shm-size=16g \
462
+ --ulimit memlock=-1 \
463
+ --ulimit stack=67108864 \
464
+ ${NEMO_TAG:-'nvcr.io/nvidia/nemo:25.02'}
465
+ ```
466
+
467
+ ## Future Work
468
+
469
+ The NeMo Framework Launcher does not currently support ASR and TTS
470
+ training, but it will soon.
471
+
472
+ ## Discussions Board
473
+
474
+ FAQ can be found on the NeMo [Discussions
475
+ board](https://github.com/NVIDIA/NeMo/discussions). You are welcome to
476
+ ask questions or start discussions on the board.
477
+
478
+ ## Contribute to NeMo
479
+
480
+ We welcome community contributions! Please refer to
481
+ [CONTRIBUTING.md](https://github.com/NVIDIA/NeMo/blob/stable/CONTRIBUTING.md)
482
+ for the process.
483
+
484
+ ## Publications
485
+
486
+ We provide an ever-growing list of
487
+ [publications](https://nvidia.github.io/NeMo/publications/) that utilize
488
+ the NeMo Framework.
489
+
490
+ To contribute an article to the collection, please submit a pull request
491
+ to the `gh-pages-src` branch of this repository. For detailed
492
+ information, please consult the README located at the [gh-pages-src
493
+ branch](https://github.com/NVIDIA/NeMo/tree/gh-pages-src#readme).
494
+
495
+ ## Blogs
496
+
497
+ <!-- markdownlint-disable -->
498
+ <details open>
499
+ <summary><b>Large Language Models and Multimodal Models</b></summary>
500
+ <details>
501
+ <summary>
502
+ <a href="https://blogs.nvidia.com/blog/bria-builds-responsible-generative-ai-using-nemo-picasso/">
503
+ Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso
504
+ </a> (2024/03/06)
505
+ </summary>
506
+ Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework.
507
+ The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation.
508
+ Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.
509
+ <br><br>
510
+ </details>
511
+ <details>
512
+ <summary>
513
+ <a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/">
514
+ New NVIDIA NeMo Framework Features and NVIDIA H200
515
+ </a> (2023/12/06)
516
+ </summary>
517
+ NVIDIA NeMo Framework now includes several optimizations and enhancements,
518
+ including:
519
+ 1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models,
520
+ 2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale,
521
+ 3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and
522
+ 4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs.
523
+ <br><br>
524
+ <a href="https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility">
525
+ <img src="https://github.com/sbhavani/TransformerEngine/blob/main/docs/examples/H200-NeMo-performance.png" alt="H200-NeMo-performance" style="width: 600px;"></a>
526
+ <br><br>
527
+ </details>
528
+ <details>
529
+ <summary>
530
+ <a href="https://blogs.nvidia.com/blog/nemo-amazon-titan/">
531
+ NVIDIA now powers training for Amazon Titan Foundation models
532
+ </a> (2023/11/28)
533
+ </summary>
534
+ NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs).
535
+ The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock.
536
+ The NeMo Framework provides a versatile framework for building, customizing, and running LLMs.
537
+ <br><br>
538
+ </details>
539
+ </details>
540
+ <!-- markdownlint-enable -->
541
+
542
+ ## Licenses
543
+
544
+ NeMo is licensed under the [Apache License 2.0](https://github.com/NVIDIA/NeMo?tab=Apache-2.0-1-ov-file).
canary_results/canary-small/checkpoints/canary-small.nemo ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42b006d88479603ae6ea345199c0f29ea686b7a4d0c4c1f2eaa30584c69c2c53
3
+ size 721438720
canary_results/canary-small/cmd-args.log ADDED
@@ -0,0 +1 @@
 
 
1
+ scripts/speech_to_text_aed.py --config-path=/home/ubuntu/NeMo/config --config-name=fast-conformer_aed.yaml name=canary-small model.prompt_format=canary2 model.train_ds.manifest_filepath=/home/ubuntu/NeMo/data/tsukasa_train.json model.validation_ds.manifest_filepath=/home/ubuntu/NeMo/data/tsukasa_val.json model.test_ds.manifest_filepath=/home/ubuntu/NeMo/data/tsukasa_test.json model.tokenizer.langs.ja.dir=/home/ubuntu/NeMo/tokenizers/jp_TSUKA_1024/tokenizer_spe_bpe_v1024 model.tokenizer.langs.spl_tokens.dir=/home/ubuntu/NeMo/tokenizers/spl_tokens spl_tokens.model_dir=/home/ubuntu/NeMo/tokenizers/spl_tokens model.encoder.n_layers=17 model.transf_decoder.config_dict.num_layers=4 model.transf_decoder.config_dict.max_sequence_length=512 model.model_defaults.asr_enc_hidden=512 model.model_defaults.lm_dec_hidden=1024 exp_manager.exp_dir=canary_results exp_manager.resume_ignore_no_checkpoint=true trainer.max_steps=200_000 trainer.log_every_n_steps=50
canary_results/canary-small/git-info.log ADDED
The diff for this file is too large to render. See raw diff
 
canary_results/canary-small/lightning_logs.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
2
+
3
+ | Name | Type | Params | Mode
4
+ -----------------------------------------------------------------------------------
5
+ 0 | preprocessor | AudioToMelSpectrogramPreprocessor | 0 | train
6
+ 1 | encoder | ConformerEncoder | 109 M | train
7
+ 2 | encoder_decoder_proj | Linear | 525 K | train
8
+ 3 | transf_decoder | TransformerDecoderNM | 69.5 M | train
9
+ 4 | log_softmax | TokenClassifier | 2.3 M | train
10
+ 5 | loss | SmoothedCrossEntropyLoss | 0 | train
11
+ 6 | spec_augmentation | SpectrogramAugmentation | 0 | train
12
+ 7 | val_loss | GlobalAverageLossMetric | 0 | train
13
+ 8 | wer | WER | 0 | train
14
+ 9 | bleu | BLEU | 0 | train
15
+ -----------------------------------------------------------------------------------
16
+ 179 M Trainable params
17
+ 0 Non-trainable params
18
+ 179 M Total params
19
+ 718.243 Total estimated model params size (MB)
20
+ 600 Modules in train mode
21
+ 0 Modules in eval mode