Spaces:
Sleeping
Sleeping
File size: 1,402 Bytes
c6e5251 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 |
from __future__ import annotations
import argparse
from . import config as CFG
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser for training a BPE tokenizer on a Parquet chat corpus.

    Defaults come from the package config module (``CFG``), so the CLI mirrors
    the configured values and any flag left unset falls back to config.

    Returns:
        argparse.ArgumentParser: parser ready for ``parse_args()``.
    """
    p = argparse.ArgumentParser(description="Train BPE tokenizer on Twitch chat Parquet dataset")
    # Every multi-word flag accepts both underscore and hyphen spellings for
    # convenience; argparse derives the dest from the first (underscore) form,
    # so existing callers and parsed-namespace attribute names are unchanged.
    p.add_argument('--data_path', '--data-path', default=CFG.DATA_PATH)
    p.add_argument('--text_column', '--text-column', default=CFG.TEXT_COLUMN)
    p.add_argument('--vocab_target', '--vocab-target', type=int, default=CFG.VOCAB_TARGET)
    p.add_argument('--val_size', '--val-size', type=int, default=CFG.VAL_SAMPLE_SIZE)
    p.add_argument('--lowercase', action='store_true')
    p.add_argument('--max_merge_steps', '--max-merge-steps', type=int, default=CFG.MAX_MERGE_STEPS)
    p.add_argument('--fast_train', '--fast-train', dest='fast_train', action='store_true', help='Enable fast train mode (batched merges & sampling) for large corpora.')
    p.add_argument('--granularity', choices=['byte','char','word'], default=CFG.TOKEN_GRANULARITY, help='Pretokenization granularity: byte (default), char, or word.')
    p.add_argument('--ratio_metric', '--ratio-metric', choices=['fixed','entropy','huffman'], default=None, help='Override ratio metric used for training estimates and validation reporting.')
    p.add_argument('--bucket_cap', '--bucket-cap', type=int, default=None, help='Override BUCKET_CAP (max vocab allowed within a fast-train pass). Set to 0 to remove cap.')
    return p
|