{ "num_docs": 20000, "seq_length": 1024, "tokenizer": "gpt2", "output_dir": "data", "min_text_length": 50, "max_text_length": 10000 }