| """ | |
| Quick wikipedia download script from huggingface for quickstart purposes. | |
| Just downloads the 20220301 english wikipedia from huggingface and | |
| does no extra preprocessing. | |
| """ | |
import argparse
import os

from datasets import load_dataset  # huggingface


def main(output_dir):
    os.makedirs(output_dir, exist_ok=True)
    # Download (or load from the local cache) the English Wikipedia 2022-03-01 snapshot.
    data = load_dataset("wikipedia", "20220301.en")
    for split, dataset in data.items():
        print("Processing split: %s" % split)
        output_file = os.path.join(output_dir, "wiki_en_20220301_%s.jsonl" % split)
        # Export the split as newline-delimited JSON.
        dataset.to_json(output_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="Where to store the wikipedia .jsonl file",
    )
    args = parser.parse_args()
    main(args.output_dir)
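
# To read an exported split back later (a minimal sketch; the filename assumes
# the dataset's default "train" split was written):
#   from datasets import load_dataset
#   wiki = load_dataset("json", data_files="wiki_en_20220301_train.jsonl")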