Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import argparse | |
| from . import config as CFG | |
def build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the BPE tokenizer training script.

    Most option defaults are read from the project configuration module
    (``CFG``). The two boolean switches (``--lowercase`` and ``--fast_train``)
    are ``store_true`` flags and are therefore off unless given. The
    ``--ratio_metric`` and ``--bucket_cap`` options default to ``None``;
    their help texts describe them as overrides, so ``None`` presumably means
    "no override" -- confirm against the code that consumes the parsed args.

    Returns:
        The configured ``argparse.ArgumentParser``; parsing is left to the
        caller.
    """
    parser = argparse.ArgumentParser(
        description="Train BPE tokenizer on Twitch chat Parquet dataset",
    )

    # --- Dataset and basic training options (defaults come from CFG) ---
    parser.add_argument("--data_path", default=CFG.DATA_PATH)
    parser.add_argument("--text_column", default=CFG.TEXT_COLUMN)
    parser.add_argument("--vocab_target", type=int, default=CFG.VOCAB_TARGET)
    parser.add_argument("--val_size", type=int, default=CFG.VAL_SAMPLE_SIZE)
    parser.add_argument("--lowercase", action="store_true")
    parser.add_argument(
        "--max_merge_steps",
        type=int,
        default=CFG.MAX_MERGE_STEPS,
    )

    # --- Options below register both the snake_case and kebab-case spelling
    # where two option strings are listed, so either form works on the CLI ---
    parser.add_argument(
        "--fast_train",
        "--fast-train",
        dest="fast_train",
        action="store_true",
        help="Enable fast train mode (batched merges & sampling) for large corpora.",
    )
    parser.add_argument(
        "--granularity",
        choices=["byte", "char", "word"],
        default=CFG.TOKEN_GRANULARITY,
        help="Pretokenization granularity: byte (default), char, or word.",
    )
    parser.add_argument(
        "--ratio_metric",
        "--ratio-metric",
        choices=["fixed", "entropy", "huffman"],
        default=None,
        help="Override ratio metric used for training estimates and validation reporting.",
    )
    parser.add_argument(
        "--bucket_cap",
        "--bucket-cap",
        type=int,
        default=None,
        help="Override BUCKET_CAP (max vocab allowed within a fast-train pass). Set to 0 to remove cap.",
    )

    return parser