# Configuration for extracting transcription text from Hugging Face datasets.

# Number of workers to use for extracting text from the datasets.
num_workers: 8

# Simple text cleaning. By default it converts all characters to lower-case
# and keeps only alpha-numeric characters.
normalize_text: false

# List of symbols to keep during text cleaning.
symbols_to_keep: ["'"]

# Key of the ground-truth transcription in each sample, e.g., MCV usually
# uses "sentence" while some other datasets use "text".
text_key: "text"

# Number of processes to use for downloading HF datasets.
num_proc: 4

# Shared values; each hf_data_cfg entry below picks them up via ${...}
# interpolation.
data_path: "librispeech_asr"
data_name: null
streaming: true

# hf_data_cfg can be a ListConfig or a DictConfig. The params of each entry
# are passed into Hugging Face load_dataset(). Add more params if needed.
# NOTE(review): some versions of Hugging Face `datasets` reject `num_proc`
# when `streaming` is true - confirm that the consuming script drops or
# tolerates `num_proc` for streaming datasets.
hf_data_cfg:
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.360'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.clean.100'
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: 'train.other.500'
    streaming: ${streaming}
    num_proc: ${num_proc}

# Presumably the file the extracted text is written to - confirm against the
# consuming script.
output_file: "librispeech_asr_train960.txt"