| import json |
| import glob |
| import os |
| from pathlib import Path |
| import sys |
|
|
| |
# Make the repository root importable so `lit_llama` resolves when this script
# is executed directly (the script lives one directory below the project root).
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))
|
|
| import numpy as np |
| from tqdm import tqdm |
|
|
| from lit_llama import Tokenizer |
| import lit_llama.packed_dataset as packed_dataset |
|
|
|
|
# File names of the RedPajama-Data-1T-Sample release, consumed by `prepare_sample`.
filenames_sample = [
    "arxiv_sample.jsonl",
    "book_sample.jsonl",
    "c4_sample.jsonl",
    "cc_2019-30_sample.jsonl",
    "cc_2020-05_sample.jsonl",
    "cc_2021-04_sample.jsonl",
    "cc_2022-05_sample.jsonl",
    "cc_2023-06_sample.jsonl",
    "github_sample.jsonl",
    "stackexchange_sample.jsonl",
    "wikipedia_sample.jsonl",
]
|
|
# Glob patterns (relative to the source directory) for each subset of the full
# RedPajama-Data-1T release, consumed by `prepare_full`.
filename_sets = {
    "arxiv": "arxiv/arxiv*",
    "book": "book/book*",
    "c4": "c4/c4-train*",
    "common_crawl": "common_crawl/*",
    "github": "github/filtered*",
    "stackexchange": "stackexchange/stackexchange*",
    "wikipedia": "wikipedia/wiki*",
}
|
|
|
|
def prepare_sample(
    source_path: Path,
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
    match: str = "",
) -> None:
    """Prepare the "Red Pajama" sample dataset.

    We assume the tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model).

    Args:
        source_path: Directory containing the downloaded ``*_sample.jsonl`` files.
        tokenizer_path: Path to the tokenizer model file.
        destination_path: Output directory for the packed dataset chunks.
        chunk_size: Number of tokens per packed output chunk.
        match: If non-empty, only process files whose name contains this substring.

    Raises:
        RuntimeError: If an expected input file is missing from ``source_path``.
    """
    destination_path.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(tokenizer_path)

    for name in filenames_sample:
        # Optional substring filter so a single subset can be prepared.
        if match and match not in name:
            continue

        filepath = source_path / name

        if not filepath.is_file():
            raise RuntimeError(
                f"Input file not found at {filepath}. \n"
                "Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
            )

        # Output files are named after the input file, minus its extension.
        prefix, _ = os.path.splitext(name)

        builder = packed_dataset.PackedDatasetBuilder(
            outdir=destination_path,
            prefix=prefix,
            chunk_size=chunk_size,
            sep_token=tokenizer.bos_id,  # documents are separated by the BOS token
            dtype="auto",
            vocab_size=tokenizer.vocab_size,
        )

        print(f"Processing {name}")

        # One JSON document per line; tokenize its "text" field and pack it.
        with open(filepath, encoding="utf-8") as f:
            for row in tqdm(f):
                text = json.loads(row)["text"]
                text_ids = tokenizer.encode(text)
                builder.add_array(np.array(text_ids, dtype=builder.dtype))

        # Flush any remaining tokens that did not fill a complete chunk.
        builder.write_reminder()
|
|
|
|
def prepare_full(
    source_path: Path,
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
    match: str = "",
) -> None:
    """Prepare the full "Red Pajama" dataset.

    We assume the tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model).

    Args:
        source_path: Root directory of the downloaded RedPajama-Data-1T release.
        tokenizer_path: Path to the tokenizer model file.
        destination_path: Output directory for the packed dataset chunks.
        chunk_size: Number of tokens per packed output chunk.
        match: If non-empty, only process subsets whose name contains this substring.

    Raises:
        RuntimeError: If no files match a subset's glob pattern.
    """
    # Imported lazily: zstandard is only needed for the full dataset's
    # compressed common_crawl shards.
    import zstandard as zstd

    destination_path.mkdir(parents=True, exist_ok=True)

    tokenizer = Tokenizer(tokenizer_path)

    for set_name, pattern in filename_sets.items():
        # Optional substring filter so a single subset can be prepared.
        if match and match not in set_name:
            continue

        # common_crawl shards are zstd-compressed; the other subsets are plain jsonl.
        is_cc = set_name == "common_crawl"

        filenames = glob.glob(os.path.join(source_path, pattern), recursive=True)

        if not filenames:
            raise RuntimeError(
                f"No files matching {pattern} found at {source_path}. \n"
                "Make sure you download the data, e.g. wget -i https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T \n"
                "https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n"
            )

        builder = packed_dataset.PackedDatasetBuilder(
            outdir=destination_path,
            prefix=set_name,
            chunk_size=chunk_size,
            sep_token=tokenizer.bos_id,  # documents are separated by the BOS token
            dtype="auto",
            vocab_size=tokenizer.vocab_size,
        )

        for name in filenames:
            # Bug fix: glob.glob was given a pattern joined with `source_path`,
            # so each result already includes the `source_path` prefix. Joining
            # with `source_path` again (the old `source_path / name`) duplicated
            # the prefix whenever `source_path` was a relative path.
            filepath = Path(name)

            print(f"Processing {name}")

            # Choose the appropriate reader once, then tokenize line by line.
            if is_cc:
                f = zstd.open(open(filepath, "rb"), "rt", encoding="utf-8")
            else:
                f = open(filepath, encoding="utf-8")
            with f:
                for row in tqdm(f):
                    text = json.loads(row)["text"]
                    text_ids = tokenizer.encode(text)
                    builder.add_array(np.array(text_ids, dtype=builder.dtype))

        # Flush any remaining tokens that did not fill a complete chunk.
        builder.write_reminder()
|
|
|
|
def prepare(
    source_path: Path = Path("data/RedPajama-Data-1T-Sample"),
    tokenizer_path: Path = Path("checkpoints/lit-llama/tokenizer.model"),
    destination_path: Path = Path("data/red_pajama_sample"),
    chunk_size: int = 2049 * 1024,
    sample: bool = False,
    match: str = "",
) -> None:
    """Prepare the "Red Pajama" dataset. We assume tokenizer has been trained (i.e. we reuse LLaMA's tokenizer model)."""
    # Pick the routine for the requested variant, then forward all options.
    prepare_fn = prepare_sample if sample else prepare_full
    prepare_fn(
        source_path=source_path,
        tokenizer_path=tokenizer_path,
        destination_path=destination_path,
        chunk_size=chunk_size,
        match=match,
    )
|
|
|
|
if __name__ == "__main__":
    from jsonargparse import CLI

    # Expose `prepare`'s signature (defaults and annotations) as a CLI.
    CLI(prepare)
|
|