"""CLI argument parsing for training a BPE tokenizer on a Twitch chat Parquet dataset."""
from __future__ import annotations
import argparse

from . import config as CFG


def build_arg_parser() -> argparse.ArgumentParser:
    p = argparse.ArgumentParser(description="Train BPE tokenizer on Twitch chat Parquet dataset")
    p.add_argument('--data_path', default=CFG.DATA_PATH, help='Path to the Parquet dataset of chat messages.')
    p.add_argument('--text_column', default=CFG.TEXT_COLUMN, help='Name of the column holding the message text.')
    p.add_argument('--vocab_target', type=int, default=CFG.VOCAB_TARGET, help='Target vocabulary size.')
    p.add_argument('--val_size', type=int, default=CFG.VAL_SAMPLE_SIZE, help='Number of messages sampled for validation.')
    p.add_argument('--lowercase', action='store_true', help='Lowercase text before training.')
    p.add_argument('--max_merge_steps', type=int, default=CFG.MAX_MERGE_STEPS, help='Upper bound on the number of BPE merge steps.')
    # Accept both underscore and hyphen variants for convenience.
    p.add_argument('--fast_train', '--fast-train', dest='fast_train', action='store_true', help='Enable fast-train mode (batched merges & sampling) for large corpora.')
    p.add_argument('--granularity', choices=['byte', 'char', 'word'], default=CFG.TOKEN_GRANULARITY, help='Pretokenization granularity: byte (default), char, or word.')
    p.add_argument('--ratio_metric', '--ratio-metric', choices=['fixed', 'entropy', 'huffman'], default=None, help='Override the ratio metric used for training estimates and validation reporting.')
    p.add_argument('--bucket_cap', '--bucket-cap', type=int, default=None, help='Override BUCKET_CAP (max vocab allowed within a fast-train pass); set to 0 to remove the cap.')
    return p
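

# Usage sketch: parse the CLI and echo the resolved options. This assumes the
# module is executed as part of its package (e.g. `python -m mypkg.cli
# --fast-train --vocab_target 32000`), since the relative import above fails
# under direct script execution; `mypkg.cli` is a placeholder package path,
# and the real training entry point is assumed to live elsewhere.
if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    for name, value in sorted(vars(args).items()):
        print(f'{name} = {value}')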