"""
tokenizer/convert_sp_to_hf.py - Convert a SentencePiece model to a HuggingFace tokenizers.json.

prepare.py's load_tokenizer() uses Tokenizer.from_file(), so it cannot read a
SentencePiece .model file directly; the model therefore has to be converted to the
HF tokenizers format.

Usage:
    python tokenizer/convert_sp_to_hf.py \
        --model tokenizer/korean_sp/tokenizer.model \
        --output tokenizer/korean_sp/tokenizer.json

Requirements:
    pip install --break-system-packages sentencepiece tokenizers transformers
"""
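# For reference, the converted file is consumed with tokenizers' Tokenizer.from_file().
# A minimal sketch of that consumer side (the path follows the Usage example above;
# this is not prepare.py's actual code):
#
#   from tokenizers import Tokenizer
#   tok = Tokenizer.from_file("tokenizer/korean_sp/tokenizer.json")
#   print(tok.encode("안녕하세요").tokens)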
|
|
from __future__ import annotations
|
|
import argparse
import json
import sys
from pathlib import Path
|
|
|
|
def convert(model_path: Path, output_path: Path) -> None:
    """Convert a SentencePiece Unigram model to a HuggingFace tokenizers.json."""
    try:
        from transformers.convert_slow_tokenizer import SpmConverter
        from tokenizers import Tokenizer
        from tokenizers.models import Unigram

        print(f"Converting: {model_path} -> {output_path}")

        # Load the trained SentencePiece model.
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.load(str(model_path))

        vocab_size = sp.vocab_size()
        print(f"Vocabulary size: {vocab_size:,}")
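        # tokenizers' Unigram model takes the vocabulary as a list of (piece, score)
        # pairs whose list index is the token id, so pieces must be collected in
        # SentencePiece id order.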
        vocab: list[tuple[str, float]] = []
        for i in range(vocab_size):
            piece = sp.id_to_piece(i)
            score = sp.get_score(i)
            vocab.append((piece, score))

        unk_id = sp.unk_id()

        tokenizer = Tokenizer(Unigram(vocab, unk_id=unk_id))
        # SentencePiece marks word boundaries with "▁" (U+2581) rather than spaces;
        # Metaspace reproduces that behavior on the HF side for encoding and decoding.
        from tokenizers.pre_tokenizers import Metaspace
        tokenizer.pre_tokenizer = Metaspace(replacement="▁", prepend_scheme="always")

        from tokenizers.decoders import Metaspace as MetaspaceDecoder
        tokenizer.decoder = MetaspaceDecoder(replacement="▁", prepend_scheme="always")
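        # Register the special tokens. pad/bos/eos ids are read from the SentencePiece
        # model (falling back to the conventional 0/1/2); add_special_tokens reuses the
        # existing vocabulary entries for these pieces and only appends new ids when a
        # piece is missing from the vocab.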
        from tokenizers import AddedToken
        pad_id = sp.pad_id() if sp.pad_id() >= 0 else 0
        bos_id = sp.bos_id() if sp.bos_id() >= 0 else 1
        eos_id = sp.eos_id() if sp.eos_id() >= 0 else 2

        tokenizer.add_special_tokens([
            AddedToken("<pad>", special=True),
            AddedToken("<s>", special=True),
            AddedToken("</s>", special=True),
            AddedToken("<unk>", special=True),
        ])

        output_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(output_path))
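        # Round-trip check: reload the saved file the same way downstream code would
        # (Tokenizer.from_file) and encode a short Korean sentence.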
        loaded = Tokenizer.from_file(str(output_path))
        test_text = "안녕하세요, 한국어 언어 모델입니다."
        encoded = loaded.encode(test_text)
        print("\nVerification passed:")
        print(f"  Test text: {test_text!r}")
        print(f"  Token count: {len(encoded.ids)}")
        print(f"  Tokens: {encoded.tokens[:15]}{'...' if len(encoded.tokens) > 15 else ''}")
        print(f"\nSaved: {output_path}")
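        # Note: if an HF transformers wrapper is needed later, the saved file can also
        # be wrapped (a sketch, not something this script requires):
        #   from transformers import PreTrainedTokenizerFast
        #   fast = PreTrainedTokenizerFast(tokenizer_file=str(output_path),
        #                                  unk_token="<unk>", pad_token="<pad>",
        #                                  bos_token="<s>", eos_token="</s>")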
|
|
    except ImportError as e:
        print(f"ERROR: missing required libraries: {e}", file=sys.stderr)
        print("  pip install --break-system-packages sentencepiece tokenizers transformers", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: conversion failed: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
|
|
|
def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a SentencePiece model to a HuggingFace tokenizers.json",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model",
        type=Path,
        required=True,
        help="Path to the SentencePiece .model file",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output path for tokenizers.json",
    )
    return parser.parse_args()
|
|
|
|
def main() -> None:
    args = parse_args()
    if not args.model.exists():
        print(f"ERROR: model file not found: {args.model}", file=sys.stderr)
        sys.exit(1)
    convert(args.model, args.output)
|
|
|
|
if __name__ == "__main__":
    main()
|
|