Helsinki-NLP/tatoeba_mt
Updated • 5.19k • 63
How to use sappho192/ffxiv-ja-ko-translator with Transformers:
# Use a pipeline as a high-level helper
# Warning: Pipeline type "translation" is no longer supported in transformers v5.
# You must load the model directly (see below) or downgrade to v4.x with:
# 'pip install "transformers<5.0.0"'
from transformers import pipeline
pipe = pipeline("translation", model="sappho192/ffxiv-ja-ko-translator")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("sappho192/ffxiv-ja-ko-translator")
model = AutoModelForSeq2SeqLM.from_pretrained("sappho192/ffxiv-ja-ko-translator")
FINAL FANTASY is a registered trademark of Square Enix Holdings Co., Ltd.
This project is described in detail in the GitHub repo.
Click to try demo
Check this Windows app demo with ONNX model
from transformers import(
EncoderDecoderModel,
PreTrainedTokenizerFast,
BertJapaneseTokenizer,
)
import torch
# The encoder checkpoint supplies the source-side tokenizer and the decoder
# checkpoint supplies the target-side tokenizer.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
decoder_model_name = "skt/kogpt2-base-v2"
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

# You should change following `./best_model` to the path of model **directory**
model = EncoderDecoderModel.from_pretrained("./best_model")

# Sample input sentence; swap in the longer one below to try a full sentence.
text = "ギルガメッシュ討伐戦"
# text = "ギルガメッシュ討伐戦に行ってきます。一緒に行きましょうか？"
def translate(text_src, max_length=500):
    """Translate one source sentence with the module-level model.

    Args:
        text_src: Source text handed to ``src_tokenizer``.
        max_length: Limit forwarded to ``model.generate``; defaults to 500,
            the value that used to be hard-coded.

    Returns:
        The string produced by ``trg_tokenizer.decode`` for the generated ids.
    """
    # Attention mask and token type ids are explicitly disabled in the
    # tokenizer call; tensors are requested in PyTorch format.
    encoded = src_tokenizer(
        text_src,
        return_attention_mask=False,
        return_token_type_ids=False,
        return_tensors="pt",
    )
    # Keep the first sequence of the batch and drop its first and last token
    # (presumably the start/end markers -- confirm against the model config).
    output = model.generate(**encoded, max_length=max_length)[0, 1:-1]
    return trg_tokenizer.decode(output.cpu())


print(translate(text))
Note that the current Optimum ONNX Runtime integration still requires PyTorch as a backend. [Issue] You can use either the [ONNX] or the [quantized ONNX] model.
from transformers import BertJapaneseTokenizer,PreTrainedTokenizerFast
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from onnxruntime import SessionOptions
import torch
# The encoder checkpoint supplies the source-side tokenizer and the decoder
# checkpoint supplies the target-side tokenizer.
encoder_model_name = "cl-tohoku/bert-base-japanese-v2"
decoder_model_name = "skt/kogpt2-base-v2"
src_tokenizer = BertJapaneseTokenizer.from_pretrained(encoder_model_name)
trg_tokenizer = PreTrainedTokenizerFast.from_pretrained(decoder_model_name)

sess_options = SessionOptions()
sess_options.log_severity_level = 3  # mute warnings including CleanUnusedInitializersAndNodeArgs

# change subfolder to "onnxq" if you want to use the quantized model
model = ORTModelForSeq2SeqLM.from_pretrained(
    "sappho192/ffxiv-ja-ko-translator",
    sess_options=sess_options,
    subfolder="onnx",
)

# Sample inputs; the trailing comment on each line is the expected translation.
texts = [
    "逃げろ!",  # Should be "도망쳐!"
    "初めまして.",  # "반가워요"
    "よろしくお願いします.",  # "잘 부탁드립니다."
    "ギルガメッシュ討伐戦",  # "길가메쉬 토벌전"
    "ギルガメッシュ討伐戦に行ってきます。一緒に行きましょうか？",  # "길가메쉬 토벌전에 갑니다. 같이 가실래요?"
    "夜になりました",  # "밤이 되었습니다"
    "ご飯を食べましょう.",  # "음, 이제 식사도 해볼까요"
]
def translate(text_src, max_length=500):
    """Translate one source sentence with the module-level ONNX model.

    Prints the source and generated token ids along the way.

    Args:
        text_src: Source text handed to ``src_tokenizer``.
        max_length: Limit forwarded to ``model.generate``; defaults to 500,
            the value that used to be hard-coded.

    Returns:
        The string produced by ``trg_tokenizer.decode`` for the generated ids.
    """
    # Attention mask and token type ids are explicitly disabled in the
    # tokenizer call; tensors are requested in PyTorch format.
    encoded = src_tokenizer(
        text_src,
        return_attention_mask=False,
        return_token_type_ids=False,
        return_tensors="pt",
    )
    print(f'Src tokens: {encoded.data["input_ids"]}')
    # Keep the first sequence of the batch and drop its first and last token
    # (presumably the start/end markers -- confirm against the model config).
    output = model.generate(**encoded, max_length=max_length)[0, 1:-1]
    print(f'Trg tokens: {output}')
    return trg_tokenizer.decode(output.cpu())


for text in texts:
    print(translate(text))
    # Blank line between results (assumes the bare print() is part of the
    # loop body -- the snippet's indentation was lost; confirm).
    print()
Check the training.ipynb.