# NeMo_Canary/scripts/tokenizers/conf/huggingface_data_tokenizer.yaml
# (source: Respair's HuggingFace repo, uploaded via huggingface_hub, commit b386992)
---
# Config for extracting ground-truth text from HuggingFace ASR datasets
# (here: LibriSpeech train-960) into a plain-text corpus for tokenizer training.

# Number of workers to use for extracting text from datasets.
num_workers: 8

# Simple text cleaning; when true, converts all chars to lower-case and keeps
# only alpha-numeric chars (plus symbols_to_keep).
normalize_text: false
symbols_to_keep: ["'"]  # a list of symbols to keep during text cleaning

# The key for the groundtruth transcription, e.g., MCV usually uses "sentence"
# while some others use "text".
text_key: "text"

# Num processes to use for downloading HF datasets.
num_proc: 4

data_path: "librispeech_asr"
data_name: null
streaming: true

# hf_data_cfg can be a ListConfig or DictConfig. Params for each entry are
# passed into huggingface load_dataset(). Add more params if needed.
hf_data_cfg:
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.360'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.100'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.other.500'
    streaming: ${streaming}
    num_proc: ${num_proc}

output_file: "librispeech_asr_train960.txt"