#!/usr/bin/env python3
"""
tokenizer/convert_sp_to_hf.py โ€” SentencePiece ๋ชจ๋ธ์„ HuggingFace tokenizers.json์œผ๋กœ ๋ณ€ํ™˜.
prepare.py์˜ load_tokenizer()๋Š” Tokenizer.from_file()์„ ์‚ฌ์šฉํ•˜๋ฏ€๋กœ
SentencePiece .model์„ ์ง์ ‘ ์ฝ์ง€ ๋ชปํ•จ โ†’ HF tokenizers ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜ ํ•„์š”.
Usage:
python tokenizer/convert_sp_to_hf.py \
--model tokenizer/korean_sp/tokenizer.model \
--output tokenizer/korean_sp/tokenizer.json
Requirements:
pip install --break-system-packages sentencepiece tokenizers transformers
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
def convert(model_path: Path, output_path: Path) -> None:
"""SentencePiece Unigram ๋ชจ๋ธ์„ HuggingFace tokenizers.json์œผ๋กœ ๋ณ€ํ™˜."""
# ๋ฐฉ๋ฒ• 1: transformers์˜ XLNetTokenizer ๊ณ„์—ด ๋ณ€ํ™˜๊ธฐ ํ™œ์šฉ
# (๋” ์™„์ „ํ•œ ๋ณ€ํ™˜, special token ์ฒ˜๋ฆฌ ํฌํ•จ)
    try:
        from transformers.convert_slow_tokenizer import SpmConverter  # noqa: F401 (unused; availability check)
        from tokenizers import Tokenizer
        from tokenizers.models import Unigram

        print(f"Converting: {model_path} → {output_path}")

        # SpmConverter maps SentencePiece models onto the tokenizers-library Unigram model;
        # here the equivalent conversion is performed by hand.
        # Load the SentencePiece model.
        import sentencepiece as spm

        sp = spm.SentencePieceProcessor()
        sp.load(str(model_path))
        vocab_size = sp.vocab_size()
        print(f"Vocabulary size: {vocab_size:,}")

        # Extract the Unigram vocab as a list of (piece, score) pairs.
        vocab: list[tuple[str, float]] = []
        for i in range(vocab_size):
            piece = sp.id_to_piece(i)
            score = sp.get_score(i)
            vocab.append((piece, score))

        # Build the HuggingFace Unigram model.
        # Determine unk_id from the SP model.
        unk_id = sp.unk_id()
        tokenizer = Tokenizer(Unigram(vocab, unk_id=unk_id))
        # Pre-tokenizer: Metaspace (SentencePiece-style: whitespace is replaced with ▁).
        # tokenizers >= 0.14: add_prefix_space → prepend_scheme='always'
        from tokenizers.pre_tokenizers import Metaspace

        tokenizer.pre_tokenizer = Metaspace(replacement="▁", prepend_scheme="always")

        # Decoder: Metaspace (the inverse mapping back to plain text).
        from tokenizers.decoders import Metaspace as MetaspaceDecoder

        tokenizer.decoder = MetaspaceDecoder(replacement="▁", prepend_scheme="always")
        # Special tokens (same IDs as the SP model). Pieces already present in the
        # vocab keep their original IDs; add_special_tokens only flags them as special.
        from tokenizers import AddedToken

        pad_id = sp.pad_id() if sp.pad_id() >= 0 else 0
        bos_id = sp.bos_id() if sp.bos_id() >= 0 else 1
        eos_id = sp.eos_id() if sp.eos_id() >= 0 else 2
        print(f"Special token IDs: pad={pad_id}, bos={bos_id}, eos={eos_id}, unk={unk_id}")
        tokenizer.add_special_tokens([
            AddedToken("<pad>", special=True),
            AddedToken("<s>", special=True),
            AddedToken("</s>", special=True),
            AddedToken("<unk>", special=True),
        ])
        output_path.parent.mkdir(parents=True, exist_ok=True)
        tokenizer.save(str(output_path))

        # Verify by reloading the saved file.
        loaded = Tokenizer.from_file(str(output_path))
        test_text = "안녕하세요, 한국어 언어 모델입니다."
        encoded = loaded.encode(test_text)
        print("\nVerification passed:")
        print(f"  Test text: {test_text!r}")
        print(f"  Token count: {len(encoded.ids)}")
        print(f"  Tokens: {encoded.tokens[:15]}{'...' if len(encoded.tokens) > 15 else ''}")
        print(f"\nSaved to: {output_path}")
    except ImportError as e:
        print(f"ERROR: required library missing: {e}", file=sys.stderr)
        print("  pip install --break-system-packages sentencepiece tokenizers transformers", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"ERROR: conversion failed: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Convert a SentencePiece model to HuggingFace tokenizers.json",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model",
        type=Path,
        required=True,
        help="Path to the SentencePiece .model file",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Path to the output tokenizers.json",
    )
    return parser.parse_args()

def main() -> None:
    args = parse_args()
    if not args.model.exists():
        print(f"ERROR: model file not found: {args.model}", file=sys.stderr)
        sys.exit(1)
    convert(args.model, args.output)

if __name__ == "__main__":
main()