Thore Andresen committed on
Commit
a986df4
·
1 Parent(s): 0f36f28

Adapt code for version 3.0 of nllb-deu-moo

Browse files
Files changed (2) hide show
  1. inference.py +2 -31
  2. requirements.txt +1 -1
inference.py CHANGED
@@ -1,39 +1,10 @@
1
  from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
2
 
3
 
4
- def create_tokenizer_with_new_lang(model_id, new_lang):
5
- """
6
- Add a new language token to the tokenizer vocabulary
7
- (this should be done each time after its initialization)
8
- """
9
- tokenizer = NllbTokenizer.from_pretrained(model_id)
10
- old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
11
- tokenizer.lang_code_to_id[new_lang] = old_len-1
12
- tokenizer.id_to_lang_code[old_len-1] = new_lang
13
- # always move "mask" to the last position
14
- tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
15
-
16
- tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
17
- tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
18
- if new_lang not in tokenizer._additional_special_tokens:
19
- tokenizer._additional_special_tokens.append(new_lang)
20
- # clear the added token encoder; otherwise a new token may end up there by mistake
21
- tokenizer.added_tokens_encoder = {}
22
- tokenizer.added_tokens_decoder = {}
23
-
24
- return tokenizer
25
-
26
-
27
  class Translator:
28
  @classmethod
29
- def from_pretrained(cls, path, new_lang='moo_Latn'):
30
- # Does the model need adaptation or not?
31
- # model, tokenizer = create_model_with_new_lang(
32
- # model_id=path,
33
- # new_lang=new_lang,
34
- # similar_lang='deu_Latn'
35
- # )
36
- tokenizer = create_tokenizer_with_new_lang(path, new_lang)
37
  model = AutoModelForSeq2SeqLM.from_pretrained(path)
38
  return Translator(model, tokenizer)
39
 
 
1
  from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
2
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  class Translator:
5
  @classmethod
6
+ def from_pretrained(cls, path):
7
+ tokenizer = NllbTokenizer.from_pretrained(path)
 
 
 
 
 
 
8
  model = AutoModelForSeq2SeqLM.from_pretrained(path)
9
  return Translator(model, tokenizer)
10
 
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
- transformers==4.33
2
  sentencepiece>=0.1.99
3
  torch>=2.1.2
 
1
+ transformers>=4.46
2
  sentencepiece>=0.1.99
3
  torch>=2.1.2