# number of workers to use for extracting text from datasets.
num_workers: 8

# simple text cleaning, by default converts all chars to lower-case and only keeps alpha-numeric chars.
# The flag is set to false in this config.
normalize_text: false
# a list of symbols to keep during text cleaning.
symbols_to_keep: ["'"]

# the key for the groundtruth transcription, e.g., MCV usually uses "sentence" while some others use "text"
text_key: "text"
# num processes to use for downloading HF datasets; interpolated into every hf_data_cfg entry as `num_proc`.
num_proc: 4

# Shared values interpolated into every hf_data_cfg entry below:
#   data_path -> `path`, data_name -> `name`, streaming -> `streaming`.
data_path: "librispeech_asr"
# null leaves `name` unset for each entry.
data_name: null
streaming: true

# hf_data_cfg can be a ListConfig or DictConfig. Params for each data entry are passed into
# huggingface load_dataset(). Add more params if needed.
# Each entry below differs only in `split`; the other params come from the shared top-level keys.
# NOTE(review): every entry forwards `num_proc` together with `streaming: true`; confirm that the
# consuming script / installed `datasets` version accepts that combination.
hf_data_cfg:
  - path: ${data_path}
    name: ${data_name}
    split: "train.clean.360"
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: "train.clean.100"
    streaming: ${streaming}
    num_proc: ${num_proc}
  - path: ${data_path}
    name: ${data_name}
    split: "train.other.500"
    streaming: ${streaming}
    num_proc: ${num_proc}

# output text file name; presumably where the text extracted from the datasets above is written — verify
# against the consuming script.
output_file: "librispeech_asr_train960.txt"