canary-1b / nvidia_canary-1b.json

add AIBOM

43511d0 verified 11 months ago

29.9 kB

	{
	"bomFormat": "CycloneDX",
	"specVersion": "1.6",
	"serialNumber": "urn:uuid:dd499724-872b-4392-817f-8511a1cd9113",
	"version": 1,
	"metadata": {
	"timestamp": "2025-06-05T09:37:28.708234+00:00",
	"component": {
	"type": "machine-learning-model",
	"bom-ref": "nvidia/canary-1b-58688bd0-57c9-5752-b615-3abc9265c7a1",
	"name": "nvidia/canary-1b",
	"externalReferences": [
	{
	"url": "https://huggingface.co/nvidia/canary-1b",
	"type": "documentation"
	}
	],
	"modelCard": {
	"modelParameters": {
	"task": "automatic-speech-recognition",
	"datasets": [
	{
	"ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19"
	},
	{
	"ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c"
	},
	{
	"ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f"
	},
	{
	"ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d"
	},
	{
	"ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3"
	},
	{
	"ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c"
	},
	{
	"ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24"
	},
	{
	"ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38"
	},
	{
	"ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630"
	},
	{
	"ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1"
	},
	{
	"ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce"
	},
	{
	"ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e"
	},
	{
	"ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6"
	}
	]
	},
	"properties": [
	{
	"name": "library_name",
	"value": "nemo"
	}
	],
	"quantitativeAnalysis": {
	"performanceMetrics": [
	{
	"slice": "dataset: librispeech_asr, split: test, config: other",
	"type": "wer",
	"value": 2.89
	},
	{
	"slice": "dataset: kensho/spgispeech, split: test, config: test",
	"type": "wer",
	"value": 4.79
	},
	{
	"slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: en",
	"type": "wer",
	"value": 7.97
	},
	{
	"slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: de",
	"type": "wer",
	"value": 4.61
	},
	{
	"slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: es",
	"type": "wer",
	"value": 3.99
	},
	{
	"slice": "dataset: mozilla-foundation/common_voice_16_1, split: test, config: fr",
	"type": "wer",
	"value": 6.53
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: en_us",
	"type": "bleu",
	"value": 32.15
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: en_us",
	"type": "bleu",
	"value": 22.66
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: en_us",
	"type": "bleu",
	"value": 40.76
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: de_de",
	"type": "bleu",
	"value": 33.98
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: es_419",
	"type": "bleu",
	"value": 21.8
	},
	{
	"slice": "dataset: google/fleurs, split: test, config: fr_fr",
	"type": "bleu",
	"value": 30.95
	},
	{
	"slice": "dataset: covost2, split: test, config: de_de",
	"type": "bleu",
	"value": 37.67
	},
	{
	"slice": "dataset: covost2, split: test, config: es_419",
	"type": "bleu",
	"value": 40.7
	},
	{
	"slice": "dataset: covost2, split: test, config: fr_fr",
	"type": "bleu",
	"value": 40.42
	}
	]
	}
	},
	"authors": [
	{
	"name": "nvidia"
	}
	],
	"licenses": [
	{
	"license": {
	"id": "CC-BY-NC-4.0",
	"url": "https://spdx.org/licenses/CC-BY-NC-4.0.html"
	}
	}
	],
	"tags": [
	"nemo",
	"automatic-speech-recognition",
	"automatic-speech-translation",
	"speech",
	"audio",
	"Transformer",
	"FastConformer",
	"Conformer",
	"pytorch",
	"NeMo",
	"hf-asr-leaderboard",
	"en",
	"de",
	"es",
	"fr",
	"dataset:librispeech_asr",
	"dataset:fisher_corpus",
	"dataset:Switchboard-1",
	"dataset:WSJ-0",
	"dataset:WSJ-1",
	"dataset:National-Singapore-Corpus-Part-1",
	"dataset:National-Singapore-Corpus-Part-6",
	"dataset:vctk",
	"dataset:voxpopuli",
	"dataset:europarl",
	"dataset:multilingual_librispeech",
	"dataset:mozilla-foundation/common_voice_8_0",
	"dataset:MLCommons/peoples_speech",
	"arxiv:2305.05084",
	"arxiv:1706.03762",
	"license:cc-by-nc-4.0",
	"model-index",
	"region:us"
	]
	}
	},
	"components": [
	{
	"type": "data",
	"bom-ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19",
	"name": "librispeech_asr",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "librispeech_asr-7baf0ed9-b50c-5f93-8c23-49a2b8749c19",
	"name": "librispeech_asr",
	"contents": {
	"url": "https://huggingface.co/datasets/librispeech_asr",
	"properties": [
	{
	"name": "task_categories",
	"value": "automatic-speech-recognition, audio-classification"
	},
	{
	"name": "task_ids",
	"value": "speaker-identification"
	},
	{
	"name": "language",
	"value": "en"
	},
	{
	"name": "size_categories",
	"value": "100K<n<1M"
	},
	{
	"name": "annotations_creators",
	"value": "expert-generated"
	},
	{
	"name": "language_creators",
	"value": "crowdsourced, expert-generated"
	},
	{
	"name": "pretty_name",
	"value": "LibriSpeech"
	},
	{
	"name": "source_datasets",
	"value": "original"
	},
	{
	"name": "paperswithcode_id",
	"value": "librispeech-1"
	},
	{
	"name": "license",
	"value": "cc-by-4.0"
	}
	]
	},
	"governance": {
	"owners": [
	{
	"organization": {
	"name": "openslr",
	"url": "https://huggingface.co/openslr"
	}
	}
	]
	},
	"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c",
	"name": "fisher_corpus",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "fisher_corpus-a0c6e2c1-e876-5c66-89b2-cb93697b2a1c",
	"name": "fisher_corpus"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f",
	"name": "Switchboard-1",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "Switchboard-1-b54b0d1d-3005-514e-9668-98d3c19f793f",
	"name": "Switchboard-1"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d",
	"name": "WSJ-0",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "WSJ-0-095442e6-ea65-5f6d-b360-432c7a2f501d",
	"name": "WSJ-0"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3",
	"name": "WSJ-1",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "WSJ-1-0ef003e6-350d-50bb-9df7-9491b0c9b0b3",
	"name": "WSJ-1"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c",
	"name": "National-Singapore-Corpus-Part-1",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "National-Singapore-Corpus-Part-1-1fbb2914-35aa-5126-9a84-a8b77169254c",
	"name": "National-Singapore-Corpus-Part-1"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24",
	"name": "National-Singapore-Corpus-Part-6",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "National-Singapore-Corpus-Part-6-4f83cf7f-3026-5a77-ae37-28a73d4abc24",
	"name": "National-Singapore-Corpus-Part-6"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38",
	"name": "vctk",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "vctk-d80444bd-bcc6-5c25-8570-061bb96dae38",
	"name": "vctk",
	"contents": {
	"url": "https://huggingface.co/datasets/vctk",
	"properties": [
	{
	"name": "task_categories",
	"value": "automatic-speech-recognition, text-to-speech, text-to-audio"
	},
	{
	"name": "task_ids",
	"value": ""
	},
	{
	"name": "language",
	"value": "en"
	},
	{
	"name": "size_categories",
	"value": "10K<n<100K"
	},
	{
	"name": "annotations_creators",
	"value": "expert-generated"
	},
	{
	"name": "language_creators",
	"value": "crowdsourced"
	},
	{
	"name": "pretty_name",
	"value": "VCTK"
	},
	{
	"name": "source_datasets",
	"value": "original"
	},
	{
	"name": "paperswithcode_id",
	"value": "vctk"
	},
	{
	"name": "license",
	"value": "cc-by-4.0"
	}
	]
	},
	"governance": {
	"owners": [
	{
	"organization": {
	"name": "CSTR-Edinburgh",
	"url": "https://huggingface.co/CSTR-Edinburgh"
	}
	}
	]
	},
	"description": "The CSTR VCTK Corpus includes speech data uttered by 110 English speakers with various accents."
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630",
	"name": "voxpopuli",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "voxpopuli-15fb6343-a710-54f9-842b-3a1b43d6a630",
	"name": "voxpopuli"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1",
	"name": "europarl",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "europarl-7e07ffed-425e-5e05-8847-08a1899f0ac1",
	"name": "europarl"
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce",
	"name": "multilingual_librispeech",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "multilingual_librispeech-f260ef31-1d5d-54fe-8e61-88c397c0b7ce",
	"name": "multilingual_librispeech",
	"contents": {
	"url": "https://huggingface.co/datasets/multilingual_librispeech",
	"properties": [
	{
	"name": "task_categories",
	"value": "automatic-speech-recognition, audio-classification"
	},
	{
	"name": "task_ids",
	"value": "speaker-identification"
	},
	{
	"name": "language",
	"value": "de, es, fr, it, nl, pl, pt"
	},
	{
	"name": "size_categories",
	"value": "100K<n<1M"
	},
	{
	"name": "annotations_creators",
	"value": "expert-generated"
	},
	{
	"name": "language_creators",
	"value": "crowdsourced, expert-generated"
	},
	{
	"name": "pretty_name",
	"value": "MultiLingual LibriSpeech"
	},
	{
	"name": "source_datasets",
	"value": "original"
	},
	{
	"name": "paperswithcode_id",
	"value": "librispeech-1"
	},
	{
	"name": "license",
	"value": "cc-by-4.0"
	}
	]
	},
	"governance": {
	"owners": [
	{
	"organization": {
	"name": "legacy-datasets",
	"url": "https://huggingface.co/legacy-datasets"
	}
	}
	]
	},
	"description": "Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish."
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e",
	"name": "mozilla-foundation/common_voice_8_0",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "mozilla-foundation/common_voice_8_0-a994a71f-f9f5-5f65-a3fa-51a56293cd8e",
	"name": "mozilla-foundation/common_voice_8_0",
	"contents": {
	"url": "https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0",
	"properties": [
	{
	"name": "task_categories",
	"value": "automatic-speech-recognition"
	},
	{
	"name": "annotations_creators",
	"value": "crowdsourced"
	},
	{
	"name": "language_creators",
	"value": "crowdsourced"
	},
	{
	"name": "pretty_name",
	"value": "Common Voice Corpus 8.0"
	},
	{
	"name": "source_datasets",
	"value": "extended\|common_voice"
	},
	{
	"name": "paperswithcode_id",
	"value": "common-voice"
	},
	{
	"name": "license",
	"value": "cc0-1.0"
	}
	]
	},
	"governance": {
	"owners": [
	{
	"organization": {
	"name": "mozilla-foundation",
	"url": "https://huggingface.co/mozilla-foundation"
	}
	}
	]
	},
	"description": "\n\t\n\t\t\n\t\tDataset Card for Common Voice Corpus 8.0\n\t\n\n\n\t\n\t\t\n\t\tDataset Summary\n\t\n\nThe Common Voice dataset consists of a unique MP3 and corresponding text file. \nMany of the 18243 recorded hours in the dataset also include demographic metadata like age, sex, and accent \nthat can help improve the accuracy of speech recognition engines.\nThe dataset currently consists of 14122 validated hours in 87 languages, but more voices and languages are always added. \nTake a look at the Languages page to\u2026 See the full description on the dataset page: https://huggingface.co/datasets/mozilla-foundation/common_voice_8_0."
	}
	]
	},
	{
	"type": "data",
	"bom-ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6",
	"name": "MLCommons/peoples_speech",
	"data": [
	{
	"type": "dataset",
	"bom-ref": "MLCommons/peoples_speech-f88dc766-1de0-51c6-865d-16930ec19be6",
	"name": "MLCommons/peoples_speech",
	"contents": {
	"url": "https://huggingface.co/datasets/MLCommons/peoples_speech",
	"properties": [
	{
	"name": "task_categories",
	"value": "automatic-speech-recognition"
	},
	{
	"name": "task_ids",
	"value": ""
	},
	{
	"name": "language",
	"value": "en"
	},
	{
	"name": "size_categories",
	"value": "1T<n"
	},
	{
	"name": "annotations_creators",
	"value": "crowdsourced, machine-generated"
	},
	{
	"name": "language_creators",
	"value": "crowdsourced, machine-generated"
	},
	{
	"name": "pretty_name",
	"value": "People's Speech"
	},
	{
	"name": "source_datasets",
	"value": "original"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: clean {\"split\": \"train\", \"path\": \"clean/train-\"}, {\"split\": \"validation\", \"path\": \"clean/validation-\"}, {\"split\": \"test\", \"path\": \"clean/test-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: clean_sa {\"split\": \"train\", \"path\": \"clean_sa/train-\"}, {\"split\": \"validation\", \"path\": \"clean_sa/validation-\"}, {\"split\": \"test\", \"path\": \"clean_sa/test-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: dirty {\"split\": \"train\", \"path\": \"dirty/train-\"}, {\"split\": \"validation\", \"path\": \"dirty/validation-\"}, {\"split\": \"test\", \"path\": \"dirty/test-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: dirty_sa {\"split\": \"train\", \"path\": \"dirty_sa/train-\"}, {\"split\": \"validation\", \"path\": \"dirty_sa/validation-\"}, {\"split\": \"test\", \"path\": \"dirty_sa/test-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: microset {\"split\": \"train\", \"path\": \"microset/train-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: test {\"split\": \"test\", \"path\": \"test/test-*\"}"
	},
	{
	"name": "configs",
	"value": "Name of the dataset subset: validation {\"split\": \"validation\", \"path\": \"validation/validation-*\"}"
	},
	{
	"name": "license",
	"value": "cc-by-2.0, cc-by-2.5, cc-by-3.0, cc-by-4.0, cc-by-sa-3.0, cc-by-sa-4.0"
	}
	]
	},
	"governance": {
	"owners": [
	{
	"organization": {
	"name": "MLCommons",
	"url": "https://huggingface.co/MLCommons"
	}
	}
	]
	},
	"description": "\n\t\n\t\t\n\t\tDataset Card for People's Speech\n\t\n\n\n\t\n\t\t\n\t\tDataset Summary\n\t\n\nThe People's Speech Dataset is among the world's largest English speech recognition corpus today that is licensed for academic and commercial usage under CC-BY-SA and CC-BY 4.0. It includes 30,000+ hours of transcribed speech in English languages with a diverse set of speakers. This open dataset is large enough to train speech-to-text systems and crucially is available with a permissive license.\n\n\t\n\t\t\n\t\n\t\n\t\tSupported Tasks\u2026 See the full description on the dataset page: https://huggingface.co/datasets/MLCommons/peoples_speech."
	}
	]
	}
	]
	}