Upload folder using huggingface_hub

#28
source/tokenizer/convert_sp_to_hf.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ tokenizer/convert_sp_to_hf.py — SentencePiece 모델을 HuggingFace tokenizers.json으로 변환.
4
+
5
+ prepare.py의 load_tokenizer()는 Tokenizer.from_file()을 사용하므로
6
+ SentencePiece .model을 직접 읽지 못함 → HF tokenizers 포맷으로 변환 필요.
7
+
8
+ Usage:
9
+ python tokenizer/convert_sp_to_hf.py \
10
+ --model tokenizer/korean_sp/tokenizer.model \
11
+ --output tokenizer/korean_sp/tokenizer.json
12
+
13
+ Requirements:
14
+ pip install --break-system-packages sentencepiece tokenizers transformers
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import argparse
20
+ import json
21
+ import sys
22
+ from pathlib import Path
23
+
24
+
25
def convert(model_path: Path, output_path: Path) -> None:
    """Convert a SentencePiece Unigram model into a HuggingFace tokenizers.json.

    Args:
        model_path: Path to the trained SentencePiece ``.model`` file.
        output_path: Destination path for the ``tokenizer.json`` file
            (parent directories are created as needed).

    Exits the process with status 1 on missing dependencies or any
    conversion failure (this is a CLI helper, not a library function).
    """
    try:
        # All heavy dependencies are imported lazily so a missing package
        # produces the friendly install hint below instead of a traceback.
        import sentencepiece as spm
        from tokenizers import AddedToken, Tokenizer
        from tokenizers.decoders import Metaspace as MetaspaceDecoder
        from tokenizers.models import Unigram
        from tokenizers.pre_tokenizers import Metaspace

        print(f"변환 중: {model_path} → {output_path}")

        sp = spm.SentencePieceProcessor()
        sp.load(str(model_path))

        vocab_size = sp.vocab_size()
        print(f"어휘 크기: {vocab_size:,}")

        # Unigram vocab is an ordered (piece, score) list; list index == token id.
        vocab: list[tuple[str, float]] = [
            (sp.id_to_piece(i), sp.get_score(i)) for i in range(vocab_size)
        ]

        tokenizer = Tokenizer(Unigram(vocab, unk_id=sp.unk_id()))

        # Metaspace mimics SentencePiece whitespace handling (space → ▁).
        # tokenizers >= 0.14 replaced add_prefix_space with prepend_scheme.
        # NOTE(review): no normalizer is attached here, so the SP model's
        # NFKC normalization is not reproduced — confirm inputs are
        # pre-normalized or that this is acceptable downstream.
        tokenizer.pre_tokenizer = Metaspace(replacement="▁", prepend_scheme="always")
        tokenizer.decoder = MetaspaceDecoder(replacement="▁", prepend_scheme="always")

        # The special pieces already exist in the SP vocab at their trained
        # ids; this call only flags them as special (no new ids allocated).
        tokenizer.add_special_tokens([
            AddedToken("<pad>", special=True),
            AddedToken("<s>", special=True),
            AddedToken("</s>", special=True),
            AddedToken("<unk>", special=True),
        ])

        output_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(output_path))

        # Round-trip check: reload the saved file and encode a sample sentence.
        loaded = Tokenizer.from_file(str(output_path))
        test_text = "안녕하세요, 한국어 언어 모델입니다."
        encoded = loaded.encode(test_text)
        print("\n검증 통과:")
        print(f"  테스트 문자: {test_text!r}")
        print(f"  토큰 수: {len(encoded.ids)}")
        print(f"  토큰: {encoded.tokens[:15]}{'...' if len(encoded.tokens) > 15 else ''}")
        print(f"\n저장 완료: {output_path}")

    except ImportError as e:
        print(f"ERROR: 필요한 라이브러리 없음: {e}", file=sys.stderr)
        print("  pip install --break-system-packages sentencepiece tokenizers transformers", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: 변환 실패: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
103
+
104
+
105
def parse_args() -> argparse.Namespace:
    """Build the CLI parser for the conversion script and parse sys.argv."""
    ap = argparse.ArgumentParser(
        description="SentencePiece 모델 → HuggingFace tokenizers.json 변환",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    ap.add_argument(
        "--model", type=Path, required=True,
        help="SentencePiece .model 파일 경로",
    )
    ap.add_argument(
        "--output", type=Path, required=True,
        help="출력 tokenizers.json 경로",
    )
    return ap.parse_args()
123
+
124
+
125
def main() -> None:
    """CLI entry point: validate the model path, then run the conversion."""
    args = parse_args()
    model = args.model
    if model.exists():
        convert(model, args.output)
    else:
        print(f"ERROR: 모델 파일 없음: {model}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
source/tokenizer/korean_sp/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
source/tokenizer/korean_sp/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edcf1eaa0a5ba871302ff42df9f80d1d0baa166ff2a57f4392c29145796bc7b2
3
+ size 1424163
source/tokenizer/korean_sp/tokenizer.vocab ADDED
The diff for this file is too large to render. See raw diff
 
source/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
source/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
source/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "pad_token": "<pad>",
7
+ "tokenizer_class": "TokenizersBackend",
8
+ "unk_token": "<unk>"
9
+ }
source/tokenizer/train_sp_tokenizer.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ tokenizer/train_sp_tokenizer.py — SentencePiece Unigram 한국어 토크나이저 학습.
4
+
5
+ 한국어 1음절(UTF-8 3바이트) = 1토큰이 되도록 Unigram 모델을 사용.
6
+ character_coverage=0.9995로 한글 11,172 음절 전체 커버.
7
+
8
+ Usage:
9
+ python tokenizer/train_sp_tokenizer.py \
10
+ --input "data/raw/namuwiki_ko/*.txt,data/raw/ko_wiki_0000.txt" \
11
+ --vocab_size 64000 \
12
+ --output_dir tokenizer/korean_sp
13
+
14
+ Output:
15
+ tokenizer/korean_sp/tokenizer.model (SentencePiece 모델)
16
+ tokenizer/korean_sp/tokenizer.vocab (어휘 목록)
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import glob
23
+ import os
24
+ import sys
25
+ import tempfile
26
+ from pathlib import Path
27
+
28
+
29
def expand_inputs(input_spec: str) -> list[str]:
    """Expand a comma-separated list of paths/glob patterns into file paths.

    Patterns that match nothing (and plain paths that do not exist) only
    produce a warning on stderr; they never abort the run.
    """
    resolved: list[str] = []
    for raw in input_spec.split(","):
        pattern = raw.strip()
        looks_like_glob = any(ch in pattern for ch in ("*", "?", "["))
        if looks_like_glob:
            hits = sorted(glob.glob(pattern, recursive=True))
            if not hits:
                print(f"WARNING: 패턴에 일치하는 파일 없음: {pattern!r}", file=sys.stderr)
            resolved.extend(hits)
        elif Path(pattern).exists():
            resolved.append(pattern)
        else:
            print(f"WARNING: 파일 없음: {pattern!r}", file=sys.stderr)
    return resolved
45
+
46
+
47
def train(
    input_files: list[str],
    output_dir: Path,
    vocab_size: int,
    num_threads: int,
    input_sentence_size: int,
) -> None:
    """Train a SentencePiece Unigram tokenizer over *input_files*.

    Writes ``tokenizer.model`` and ``tokenizer.vocab`` into *output_dir*,
    then prints the follow-up conversion command. Exits with status 1 when
    sentencepiece is missing or the model file was not produced.
    """
    try:
        import sentencepiece as spm
    except ImportError:
        print(
            "ERROR: sentencepiece가 설치되지 않음.\n"
            "  pip install --break-system-packages sentencepiece",
            file=sys.stderr,
        )
        sys.exit(1)

    output_dir.mkdir(parents=True, exist_ok=True)
    model_prefix = str(output_dir / "tokenizer")

    # Echo the run configuration (at most five input files listed).
    print(f"입력 파일 수: {len(input_files)}")
    for path in input_files[:5]:
        print(f"  {path}")
    if len(input_files) > 5:
        print(f"  ... 외 {len(input_files) - 5}개")
    print(f"어휘 크기: {vocab_size:,}")
    print(f"출력 경로: {model_prefix}.model / .vocab")
    print()

    # SentencePiece accepts the file list as a single comma-separated string.
    spm.SentencePieceTrainer.train(
        input=",".join(input_files),
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type="unigram",            # reads more naturally for Korean than BPE
        character_coverage=0.9995,       # full coverage of the 11,172 Hangul syllables
        normalization_rule_name="nfkc",  # NFKC normalization (unifies Korean compatibility forms)
        pad_id=0,
        bos_id=1,
        eos_id=2,
        unk_id=3,
        pad_piece="<pad>",
        bos_piece="<s>",
        eos_piece="</s>",
        unk_piece="<unk>",
        user_defined_symbols=[],
        num_threads=num_threads,
        input_sentence_size=input_sentence_size,
        shuffle_input_sentence=True,
        # Training stability knobs.
        seed_sentencepiece_size=1_000_000,
        shrinking_factor=0.75,
        max_sentence_length=4096,
    )

    model_path = Path(f"{model_prefix}.model")
    vocab_path = Path(f"{model_prefix}.vocab")

    if not model_path.exists():
        print("ERROR: 학습 실패 — 출력 파일이 생성되지 않음", file=sys.stderr)
        sys.exit(1)

    size_mb = model_path.stat().st_size / 1e6
    print(f"학습 완료!")
    print(f"  모델: {model_path} ({size_mb:.1f} MB)")
    print(f"  어휘: {vocab_path}")
    print()
    print("다음 단계:")
    print(f"  python tokenizer/convert_sp_to_hf.py \\")
    print(f"    --model {model_path} \\")
    print(f"    --output {output_dir}/tokenizer.json")
120
+
121
+
122
def parse_args() -> argparse.Namespace:
    """Define and parse the training CLI options."""
    ap = argparse.ArgumentParser(
        description="SentencePiece Unigram 한국어 토크나이저 학습",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    ap.add_argument(
        "--input",
        required=True,
        help="콤마로 구분된 파일/글로브 패턴 (예: 'data/raw/ko/*.txt,data/raw/wiki.txt')",
    )
    ap.add_argument("--vocab_size", type=int, default=64000, help="어휘 크기")
    ap.add_argument(
        "--output_dir",
        type=Path,
        default=Path("tokenizer/korean_sp"),
        help="모델 저장 디렉토리",
    )
    ap.add_argument(
        "--num_threads",
        type=int,
        default=64,
        help="학습에 사용할 CPU 스레드 수",
    )
    ap.add_argument(
        "--input_sentence_size",
        type=int,
        default=10_000_000,
        help="학습에 사용할 최대 문장 수 (0 = 무제한)",
    )
    return ap.parse_args()
157
+
158
+
159
def main() -> None:
    """Entry point: resolve the input corpus, then kick off training."""
    args = parse_args()
    files = expand_inputs(args.input)
    if not files:
        print("ERROR: 입력 파일이 없습니다.", file=sys.stderr)
        sys.exit(1)
    train(
        input_files=files,
        output_dir=args.output_dir,
        vocab_size=args.vocab_size,
        num_threads=args.num_threads,
        input_sentence_size=args.input_sentence_size,
    )


if __name__ == "__main__":
    main()
source/tokenizer/train_tokenizer.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Train a Byte-Level BPE tokenizer on raw text files.
3
+
4
+ The tokenizer is saved in two formats:
5
+ 1. Native HuggingFace ``tokenizers`` format (vocab.json + merges.txt) inside
6
+ the output directory — for fast loading with ByteLevelBPETokenizer.
7
+ 2. A ``tokenizer.json`` file (PreTrainedTokenizerFast) in the output directory
8
+ — for easy loading with transformers.AutoTokenizer.
9
+
10
+ Usage:
11
+ python tokenizer/train_tokenizer.py \
12
+ --input "data/raw/*.txt" \
13
+ --output tokenizer/ \
14
+ --vocab_size 32000 \
15
+ --min_frequency 2
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import argparse
21
+ import glob
22
+ import os
23
+ import sys
24
+ from pathlib import Path
25
+
26
+ from tokenizers import AddedToken
27
+ from tokenizers.implementations import ByteLevelBPETokenizer
28
+ from transformers import PreTrainedTokenizerFast
29
+
30
+
31
# ---------------------------------------------------------------------------
# Special tokens
# ---------------------------------------------------------------------------
# Passed to the BPE trainer and re-registered after training in main().
# Order matters: presumably the trainer assigns these the lowest ids in this
# order (pad, bos, eos, unk) — verify against the tokenizers version in use.
SPECIAL_TOKENS: list[str] = ["<pad>", "<s>", "</s>", "<unk>"]
35
+
36
+
37
+ # ---------------------------------------------------------------------------
38
+ # Helpers
39
+ # ---------------------------------------------------------------------------
40
+
41
def find_input_files(pattern: str) -> list[str]:
    """Resolve *pattern* (glob or literal path) to a sorted list of paths.

    Raises:
        FileNotFoundError: if the glob matches nothing or the plain path
            does not exist.
    """
    has_glob_chars = any(ch in pattern for ch in "*?[")
    if has_glob_chars:
        matches = sorted(glob.glob(pattern, recursive=True))
    elif Path(pattern).exists():
        matches = [pattern]
    else:
        matches = []
    if not matches:
        raise FileNotFoundError(f"No files matched pattern: {pattern!r}")
    return matches
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Main
54
+ # ---------------------------------------------------------------------------
55
+
56
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for BPE tokenizer training."""
    ap = argparse.ArgumentParser(
        description="Train a Byte-Level BPE tokenizer and save to disk."
    )
    ap.add_argument(
        "--input",
        required=True,
        help='Glob pattern for training text files, e.g. "data/raw/*.txt"',
    )
    ap.add_argument(
        "--output",
        default="tokenizer/",
        help="Output directory for the trained tokenizer (default: tokenizer/)",
    )
    ap.add_argument(
        "--vocab_size",
        type=int,
        default=32000,
        help="Target vocabulary size (default: 32000)",
    )
    ap.add_argument(
        "--min_frequency",
        type=int,
        default=2,
        help="Minimum frequency for a pair to be merged (default: 2)",
    )
    return ap.parse_args()
83
+
84
+
85
def main() -> None:
    """Train a Byte-Level BPE tokenizer from the CLI args and save both formats."""
    args = parse_args()

    # Resolve the training corpus (raises if nothing matches).
    input_files = find_input_files(args.input)
    print(f"Found {len(input_files)} training file(s).")

    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    bpe = ByteLevelBPETokenizer()

    print(
        f"\nTraining BPE tokenizer | vocab_size={args.vocab_size} "
        f"| min_frequency={args.min_frequency} ..."
    )
    bpe.train(
        files=input_files,
        vocab_size=args.vocab_size,
        min_frequency=args.min_frequency,
        special_tokens=SPECIAL_TOKENS,
        show_progress=True,
    )

    # Re-register the special tokens explicitly (ensures they have the right IDs).
    bpe.add_special_tokens(SPECIAL_TOKENS)

    # Native tokenizers format: vocab.json + merges.txt.
    bpe.save_model(str(out_dir))
    print(f"\nSaved vocab.json + merges.txt to: {out_dir}")

    # transformers-compatible format: tokenizer.json via PreTrainedTokenizerFast.
    fast = PreTrainedTokenizerFast(
        tokenizer_object=bpe._tokenizer,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
    )
    json_path = out_dir / "tokenizer.json"
    fast.save_pretrained(str(out_dir))
    print(f"Saved PreTrainedTokenizerFast to: {out_dir}")
    print(f"  -> tokenizer.json: {json_path}")

    # Summary statistics.
    vocab_count = bpe.get_vocab_size()
    banner = "=" * 50
    print("\n" + banner)
    print("Tokenizer training statistics")
    print(banner)
    print(f"  Training files : {len(input_files):>10,}")
    print(f"  Target vocab   : {args.vocab_size:>10,}")
    print(f"  Actual vocab   : {vocab_count:>10,}")
    print(f"  Min frequency  : {args.min_frequency:>10,}")
    print(f"  Special tokens : {SPECIAL_TOKENS}")
    print(f"  Output dir     : {out_dir.resolve()}")
    print(banner)


if __name__ == "__main__":
    main()
source/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff