| { | |
| "bomFormat": "CycloneDX", | |
| "specVersion": "1.6", | |
| "serialNumber": "urn:uuid:dd499724-872b-4392-817f-8511a1cd9113", | |
| "version": 1, | |
| "metadata": { | |
| "timestamp": "2025-06-05T09:37:28.708234+00:00", | |
| "component": { | |
| "type": "machine-learning-model", | |
| "bom-ref": "nvidia/canary-1b-58688bd0-57c9-5752-b615-3abc9265c7a1", | |
| "name": "nvidia/canary-1b", | |
| "externalReferences": [ | |
| { | |
| "url": "https://huggingface.co/nvidia/canary-1b", | |
| "type": "documentation" | |
| } | |
| ], | |
| "modelCard": { | |
| "modelParameters": { | |
| "task": "automatic-speech-recognition", | |
| "datasets": [ | |
| { | |
| "ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19" | |
| }, | |
| { | |
| "ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c" | |
| }, | |
| { | |
| "ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f" | |
| }, | |
| { | |
| "ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d" | |
| }, | |
| { | |
| "ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3" | |
| }, | |
| { | |
| "ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c" | |
| }, | |
| { | |
| "ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24" | |
| }, | |
| { | |
| "ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38" | |
| }, | |
| { | |
| "ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630" | |
| }, | |
| { | |
| "ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1" | |
| }, | |
| { | |
| "ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce" | |
| }, | |
| { | |
| "ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e" | |
| }, | |
| { | |
| "ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6" | |
| } | |
| ] | |
| }, | |
| "properties": [ | |
| { | |
| "name": "library_name", | |
| "value": "nemo" | |
| } | |
| ], | |
| "quantitativeAnalysis": { | |
| "performanceMetrics": [ | |
| { | |
| "slice": "dataset: librispeech_asr, split: test, config: other", | |
| "type": "wer", | |
| "value": 2.89 | |
| }, | |
| { | |
| "slice": "dataset: kensho/spgispeech, split: test, config: test", | |
| "type": "wer", | |
| "value": 4.79 | |
| }, | |
| { | |
| "slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: en", | |
| "type": "wer", | |
| "value": 7.97 | |
| }, | |
| { | |
| "slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: de", | |
| "type": "wer", | |
| "value": 4.61 | |
| }, | |
| { | |
| "slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: es", | |
| "type": "wer", | |
| "value": 3.99 | |
| }, | |
| { | |
| "slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: fr", | |
| "type": "wer", | |
| "value": 6.53 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: en_us", | |
| "type": "bleu", | |
| "value": 32.15 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: en_us", | |
| "type": "bleu", | |
| "value": 22.66 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: en_us", | |
| "type": "bleu", | |
| "value": 40.76 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: de_de", | |
| "type": "bleu", | |
| "value": 33.98 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: es_419", | |
| "type": "bleu", | |
| "value": 21.8 | |
| }, | |
| { | |
| "slice": "dataset: google/fleurs, split: test, config: fr_fr", | |
| "type": "bleu", | |
| "value": 30.95 | |
| }, | |
| { | |
| "slice": "dataset: covost2, split: test, config: de_de", | |
| "type": "bleu", | |
| "value": 37.67 | |
| }, | |
| { | |
| "slice": "dataset: covost2, split: test, config: es_419", | |
| "type": "bleu", | |
| "value": 40.7 | |
| }, | |
| { | |
| "slice": "dataset: covost2, split: test, config: fr_fr", | |
| "type": "bleu", | |
| "value": 40.42 | |
| } | |
| ] | |
| } | |
| }, | |
| "authors": [ | |
| { | |
| "name": "nvidia" | |
| } | |
| ], | |
| "licenses": [ | |
| { | |
| "license": { | |
| "id": "CC-BY-NC-4.0", | |
| "url": "https://spdx.org/licenses/CC-BY-NC-4.0.html" | |
| } | |
| } | |
| ], | |
| "tags": [ | |
| "nemo", | |
| "automatic-speech-recognition", | |
| "automatic-speech-translation", | |
| "speech", | |
| "audio", | |
| "Transformer", | |
| "FastConformer", | |
| "Conformer", | |
| "pytorch", | |
| "NeMo", | |
| "hf-asr-leaderboard", | |
| "en", | |
| "de", | |
| "es", | |
| "fr", | |
| "dataset:librispeech_asr", | |
| "dataset:fisher_corpus", | |
| "dataset:Switchboard-1", | |
| "dataset:WSJ-0", | |
| "dataset:WSJ-1", | |
| "dataset:National-Singapore-Corpus-Part-1", | |
| "dataset:National-Singapore-Corpus-Part-6", | |
| "dataset:vctk", | |
| "dataset:voxpopuli", | |
| "dataset:europarl", | |
| "dataset:multilingual_librispeech", | |
| "dataset:mozilla-foundation/common_voice_8_0", | |
| "dataset:MLCommons/peoples_speech", | |
| "arxiv:2305.05084", | |
| "arxiv:1706.03762", | |
| "license:cc-by-nc-4.0", | |
| "model-index", | |
| "region:us" | |
| ] | |
| } | |
| }, | |
| "components": [ | |
| { | |
| "type": "data", | |
| "bom-ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19", | |
| "name": "librispeech_asr", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19", | |
| "name": "librispeech_asr", | |
| "contents": { | |
| "url": "https://huggingface.co/datasets/librispeech_asr", | |
| "properties": [ | |
| { | |
| "name": "task_categories", | |
| "value": "automatic-speech-recognition, audio-classification" | |
| }, | |
| { | |
| "name": "task_ids", | |
| "value": "speaker-identification" | |
| }, | |
| { | |
| "name": "language", | |
| "value": "en" | |
| }, | |
| { | |
| "name": "size_categories", | |
| "value": "100K<n<1M" | |
| }, | |
| { | |
| "name": "annotations_creators", | |
| "value": "expert-generated" | |
| }, | |
| { | |
| "name": "language_creators", | |
| "value": "crowdsourced, expert-generated" | |
| }, | |
| { | |
| "name": "pretty_name", | |
| "value": "LibriSpeech" | |
| }, | |
| { | |
| "name": "source_datasets", | |
| "value": "original" | |
| }, | |
| { | |
| "name": "paperswithcode_id", | |
| "value": "librispeech-1" | |
| }, | |
| { | |
| "name": "license", | |
| "value": "cc-by-4.0" | |
| } | |
| ] | |
| }, | |
| "governance": { | |
| "owners": [ | |
| { | |
| "organization": { | |
| "name": "openslr", | |
| "url": "https://huggingface.co/openslr" | |
| } | |
| } | |
| ] | |
| }, | |
| "description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned." | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c", | |
| "name": "fisher_corpus", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c", | |
| "name": "fisher_corpus" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f", | |
| "name": "Switchboard-1", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f", | |
| "name": "Switchboard-1" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d", | |
| "name": "WSJ-0", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d", | |
| "name": "WSJ-0" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3", | |
| "name": "WSJ-1", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3", | |
| "name": "WSJ-1" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c", | |
| "name": "National-Singapore-Corpus-Part-1", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c", | |
| "name": "National-Singapore-Corpus-Part-1" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24", | |
| "name": "National-Singapore-Corpus-Part-6", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24", | |
| "name": "National-Singapore-Corpus-Part-6" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38", | |
| "name": "vctk", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38", | |
| "name": "vctk", | |
| "contents": { | |
| "url": "https://huggingface.co/datasets/vctk", | |
| "properties": [ | |
| { | |
| "name": "task_categories", | |
| "value": "automatic-speech-recognition, text-to-speech, text-to-audio" | |
| }, | |
| { | |
| "name": "task_ids", | |
| "value": "" | |
| }, | |
| { | |
| "name": "language", | |
| "value": "en" | |
| }, | |
| { | |
| "name": "size_categories", | |
| "value": "10K<n<100K" | |
| }, | |
| { | |
| "name": "annotations_creators", | |
| "value": "expert-generated" | |
| }, | |
| { | |
| "name": "language_creators", | |
| "value": "crowdsourced" | |
| }, | |
| { | |
| "name": "pretty_name", | |
| "value": "VCTK" | |
| }, | |
| { | |
| "name": "source_datasets", | |
| "value": "original" | |
| }, | |
| { | |
| "name": "paperswithcode_id", | |
| "value": "vctk" | |
| }, | |
| { | |
| "name": "license", | |
| "value": "cc-by-4.0" | |
| } | |
| ] | |
| }, | |
| "governance": { | |
| "owners": [ | |
| { | |
| "organization": { | |
| "name": "CSTR-Edinburgh", | |
| "url": "https://huggingface.co/CSTR-Edinburgh" | |
| } | |
| } | |
| ] | |
| }, | |
| "description": "The CSTR VCTK Corpus includes speech data uttered by 110 English speakers with various accents." | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630", | |
| "name": "voxpopuli", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630", | |
| "name": "voxpopuli" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1", | |
| "name": "europarl", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1", | |
| "name": "europarl" | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce", | |
| "name": "multilingual_librispeech", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce", | |
| "name": "multilingual_librispeech", | |
| "contents": { | |
| "url": "https://huggingface.co/datasets/multilingual_librispeech", | |
| "properties": [ | |
| { | |
| "name": "task_categories", | |
| "value": "automatic-speech-recognition, audio-classification" | |
| }, | |
| { | |
| "name": "task_ids", | |
| "value": "speaker-identification" | |
| }, | |
| { | |
| "name": "language", | |
| "value": "de, es, fr, it, nl, pl, pt" | |
| }, | |
| { | |
| "name": "size_categories", | |
| "value": "100K<n<1M" | |
| }, | |
| { | |
| "name": "annotations_creators", | |
| "value": "expert-generated" | |
| }, | |
| { | |
| "name": "language_creators", | |
| "value": "crowdsourced, expert-generated" | |
| }, | |
| { | |
| "name": "pretty_name", | |
| "value": "MultiLingual LibriSpeech" | |
| }, | |
| { | |
| "name": "source_datasets", | |
| "value": "original" | |
| }, | |
| { | |
| "name": "paperswithcode_id", | |
| "value": "librispeech-1" | |
| }, | |
| { | |
| "name": "license", | |
| "value": "cc-by-4.0" | |
| } | |
| ] | |
| }, | |
| "governance": { | |
| "owners": [ | |
| { | |
| "organization": { | |
| "name": "legacy-datasets", | |
| "url": "https://huggingface.co/legacy-datasets" | |
| } | |
| } | |
| ] | |
| }, | |
| "description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish." | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e", | |
| "name": "mozilla-foundation/common_voice_8_0", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e", | |
| "name": "mozilla-foundation/common_voice_8_0", | |
| "contents": { | |
| "url": "https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0", | |
| "properties": [ | |
| { | |
| "name": "task_categories", | |
| "value": "automatic-speech-recognition" | |
| }, | |
| { | |
| "name": "annotations_creators", | |
| "value": "crowdsourced" | |
| }, | |
| { | |
| "name": "language_creators", | |
| "value": "crowdsourced" | |
| }, | |
| { | |
| "name": "pretty_name", | |
| "value": "Common Voice Corpus 8.0" | |
| }, | |
| { | |
| "name": "source_datasets", | |
| "value": "extended|common_voice" | |
| }, | |
| { | |
| "name": "paperswithcode_id", | |
| "value": "common-voice" | |
| }, | |
| { | |
| "name": "license", | |
| "value": "cc0-1.0" | |
| } | |
| ] | |
| }, | |
| "governance": { | |
| "owners": [ | |
| { | |
| "organization": { | |
| "name": "mozilla-foundation", | |
| "url": "https://huggingface.co/mozilla-foundation" | |
| } | |
| } | |
| ] | |
| }, | |
| "description": "Dataset Card for Common Voice Corpus 8.0\n\nDataset Summary\n\nThe Common Voice dataset consists of a unique MP3 and corresponding text file. \nMany of the 18243 recorded hours in the dataset also include demographic metadata like age, sex, and accent \nthat can help improve the accuracy of speech recognition engines.\nThe dataset currently consists of 14122 validated hours in 87 languages, but more voices and languages are always added. \nTake a look at the Languages page to\u2026 See the full description on the dataset page: https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0." | |
| } | |
| ] | |
| }, | |
| { | |
| "type": "data", | |
| "bom-ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6", | |
| "name": "MLCommons/peoples_speech", | |
| "data": [ | |
| { | |
| "type": "dataset", | |
| "bom-ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6", | |
| "name": "MLCommons/peoples_speech", | |
| "contents": { | |
| "url": "https://huggingface.co/datasets/MLCommons/peoples_speech", | |
| "properties": [ | |
| { | |
| "name": "task_categories", | |
| "value": "automatic-speech-recognition" | |
| }, | |
| { | |
| "name": "task_ids", | |
| "value": "" | |
| }, | |
| { | |
| "name": "language", | |
| "value": "en" | |
| }, | |
| { | |
| "name": "size_categories", | |
| "value": "1T<n" | |
| }, | |
| { | |
| "name": "annotations_creators", | |
| "value": "crowdsourced, machine-generated" | |
| }, | |
| { | |
| "name": "language_creators", | |
| "value": "crowdsourced, machine-generated" | |
| }, | |
| { | |
| "name": "pretty_name", | |
| "value": "People's Speech" | |
| }, | |
| { | |
| "name": "source_datasets", | |
| "value": "original" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: clean {\"split\": \"train\", \"path\": \"clean/train-*\"}, {\"split\": \"validation\", \"path\": \"clean/validation-*\"}, {\"split\": \"test\", \"path\": \"clean/test-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: clean_sa {\"split\": \"train\", \"path\": \"clean_sa/train-*\"}, {\"split\": \"validation\", \"path\": \"clean_sa/validation-*\"}, {\"split\": \"test\", \"path\": \"clean_sa/test-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: dirty {\"split\": \"train\", \"path\": \"dirty/train-*\"}, {\"split\": \"validation\", \"path\": \"dirty/validation-*\"}, {\"split\": \"test\", \"path\": \"dirty/test-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: dirty_sa {\"split\": \"train\", \"path\": \"dirty_sa/train-*\"}, {\"split\": \"validation\", \"path\": \"dirty_sa/validation-*\"}, {\"split\": \"test\", \"path\": \"dirty_sa/test-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: microset {\"split\": \"train\", \"path\": \"microset/train-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: test {\"split\": \"test\", \"path\": \"test/test-*\"}" | |
| }, | |
| { | |
| "name": "configs", | |
| "value": "Name of the dataset subset: validation {\"split\": \"validation\", \"path\": \"validation/validation-*\"}" | |
| }, | |
| { | |
| "name": "license", | |
| "value": "cc-by-2.0, cc-by-2.5, cc-by-3.0, cc-by-4.0, cc-by-sa-3.0, cc-by-sa-4.0" | |
| } | |
| ] | |
| }, | |
| "governance": { | |
| "owners": [ | |
| { | |
| "organization": { | |
| "name": "MLCommons", | |
| "url": "https://huggingface.co/MLCommons" | |
| } | |
| } | |
| ] | |
| }, | |
| "description": "Dataset Card for People's Speech\n\nDataset Summary\n\nThe People's Speech Dataset is among the world's largest English speech recognition corpus today that is licensed for academic and commercial usage under CC-BY-SA and CC-BY 4.0. It includes 30,000+ hours of transcribed speech in English languages with a diverse set of speakers. This open dataset is large enough to train speech-to-text systems and crucially is available with a permissive license.\n\nSupported Tasks\u2026 See the full description on the dataset page: https://huggingface.co/datasets/MLCommons/peoples_speech." | |
| } | |
| ] | |
| } | |
| ] | |
| } |