Spaces:
Running
Running
Thore Andresen committed on
Commit ·
a986df4
1
Parent(s): 0f36f28
Adapt code for version 3.0 of nllb-deu-moo
Browse files
- inference.py +2 -31
- requirements.txt +1 -1
inference.py
CHANGED
|
@@ -1,39 +1,10 @@
|
|
| 1 |
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
|
| 2 |
|
| 3 |
|
| 4 |
-
def create_tokenizer_with_new_lang(model_id, new_lang):
|
| 5 |
-
"""
|
| 6 |
-
Add a new language token to the tokenizer vocabulary
|
| 7 |
-
(this should be done each time after its initialization)
|
| 8 |
-
"""
|
| 9 |
-
tokenizer = NllbTokenizer.from_pretrained(model_id)
|
| 10 |
-
old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
|
| 11 |
-
tokenizer.lang_code_to_id[new_lang] = old_len-1
|
| 12 |
-
tokenizer.id_to_lang_code[old_len-1] = new_lang
|
| 13 |
-
# always move "mask" to the last position
|
| 14 |
-
tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
|
| 15 |
-
|
| 16 |
-
tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
|
| 17 |
-
tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
|
| 18 |
-
if new_lang not in tokenizer._additional_special_tokens:
|
| 19 |
-
tokenizer._additional_special_tokens.append(new_lang)
|
| 20 |
-
# clear the added token encoder; otherwise a new token may end up there by mistake
|
| 21 |
-
tokenizer.added_tokens_encoder = {}
|
| 22 |
-
tokenizer.added_tokens_decoder = {}
|
| 23 |
-
|
| 24 |
-
return tokenizer
|
| 25 |
-
|
| 26 |
-
|
| 27 |
class Translator:
|
| 28 |
@classmethod
|
| 29 |
-
def from_pretrained(cls, path, new_lang):
|
| 30 |
-
|
| 31 |
-
# model, tokenizer = create_model_with_new_lang(
|
| 32 |
-
# model_id=path,
|
| 33 |
-
# new_lang=new_lang,
|
| 34 |
-
# similar_lang='deu_Latn'
|
| 35 |
-
# )
|
| 36 |
-
tokenizer = create_tokenizer_with_new_lang(path, new_lang)
|
| 37 |
model = AutoModelForSeq2SeqLM.from_pretrained(path)
|
| 38 |
return Translator(model, tokenizer)
|
| 39 |
|
|
|
|
| 1 |
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
|
| 2 |
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
class Translator:
|
| 5 |
@classmethod
|
| 6 |
+
def from_pretrained(cls, path):
|
| 7 |
+
tokenizer = NllbTokenizer.from_pretrained(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
model = AutoModelForSeq2SeqLM.from_pretrained(path)
|
| 9 |
return Translator(model, tokenizer)
|
| 10 |
|
requirements.txt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
transformers=
|
| 2 |
sentencepiece>=0.1.99
|
| 3 |
torch>=2.1.2
|
|
|
|
| 1 |
+
transformers>=4.46
|
| 2 |
sentencepiece>=0.1.99
|
| 3 |
torch>=2.1.2
|