# NeMo_Canary/scripts/tokenizers/conf/huggingface_data_tokenizer.yaml
# (source: Respair's HuggingFace repo, uploaded via huggingface_hub, commit b386992)
---
# Config for extracting ground-truth text from HuggingFace ASR datasets
# (here: LibriSpeech train-960) into a plain-text corpus for tokenizer training.

# Number of workers to use for extracting text from datasets.
num_workers: 8

# Simple text cleaning; when true, converts all chars to lower-case and keeps
# only alpha-numeric chars (plus symbols_to_keep).
normalize_text: false
symbols_to_keep: ["'"]  # a list of symbols to keep during text cleaning

# The key for the groundtruth transcription, e.g., MCV usually uses "sentence"
# while some others use "text".
text_key: "text"

# Num processes to use for downloading HF datasets.
num_proc: 4

data_path: "librispeech_asr"
data_name: null
streaming: true

# hf_data_cfg can be a ListConfig or DictConfig. Params for each entry are
# passed into huggingface load_dataset(). Add more params if needed.
hf_data_cfg:
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.360'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.100'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.other.500'
    streaming: ${streaming}
    num_proc: ${num_proc}

output_file: "librispeech_asr_train960.txt"