pollitoconpapass committed on
Commit
93d35c0
·
verified ·
1 Parent(s): 5788c5a

Upload README file

Browse files
Files changed (1) hide show
  1. README.md +122 -0
README.md ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ datasets:
4
+ - pollitoconpapass/new-cuzco-quechua-translation-dataset
5
+ language:
6
+ - qu
7
+ base_model:
8
+ - facebook/nllb-200-distilled-600M
9
+ pipeline_tag: translation
10
+ ---
11
+ ## Overview
12
+ This model is a fine-tuned version of [nllb-200-distilled-600M](https://huggingface.co/facebook/nllb-200-distilled-600M), adapted to handle the Cuzco Quechua language.
13
+
14
+ ## Model Implementation
15
+ Use the script below to test the model, changing the values (input text, source language, and target language) as needed.
16
+ ```py
17
+ import time
18
+ from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
19
+
20
+
21
def fix_tokenizer(tokenizer, new_lang='quz_Latn'):
    """
    Register a new language code on an NLLB tokenizer and return the tokenizer.

    The tokenizer is modified in place: ``new_lang`` is added as an additional
    special token when it is missing, and it is recorded in the
    ``lang_code_to_id`` / ``id_to_lang_code`` mappings, which are created on
    the tokenizer when it does not already provide them.

    Args:
        tokenizer: Object exposing ``sp_model``, ``additional_special_tokens``,
            ``add_special_tokens`` and ``convert_tokens_to_ids``.
        new_lang: Language code to register.

    Returns:
        The same tokenizer object that was passed in.

    Raises:
        ValueError: If ``tokenizer`` has no ``sp_model`` attribute.
    """
    # First ensure we're working with an NLLB tokenizer
    if not hasattr(tokenizer, 'sp_model'):
        raise ValueError("This function expects an NLLB tokenizer")

    # Add the new language token if it's not already present.
    existing_tokens = list(tokenizer.additional_special_tokens)
    if new_lang not in existing_tokens:
        # Pass the complete list rather than only the new code:
        # add_special_tokens replaces the registered additional special
        # tokens by default, which would drop the existing language codes.
        tokenizer.add_special_tokens({
            'additional_special_tokens': existing_tokens + [new_lang]
        })

    # Create the forward mapping if the tokenizer does not provide one.
    if not hasattr(tokenizer, 'lang_code_to_id'):
        tokenizer.lang_code_to_id = {}

    # Record the id of the new language token, keeping any existing entry.
    if new_lang not in tokenizer.lang_code_to_id:
        tokenizer.lang_code_to_id[new_lang] = tokenizer.convert_tokens_to_ids(new_lang)

    # Create the reverse mapping if the tokenizer does not provide one.
    if not hasattr(tokenizer, 'id_to_lang_code'):
        tokenizer.id_to_lang_code = {}

    # Keep the reverse mapping in sync with the forward one.
    tokenizer.id_to_lang_code[tokenizer.lang_code_to_id[new_lang]] = new_lang

    return tokenizer
53
+
54
+
55
# Identifier handed to from_pretrained for both the model and the tokenizer.
MODEL_URL = "pollitoconpapass/QnIA-translation-model"

# fix_tokenizer returns the tokenizer it was given, so loading and patching
# can be written as a single expression.
tokenizer = fix_tokenizer(NllbTokenizer.from_pretrained(MODEL_URL))
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_URL)
59
+
60
def translate(text, src_lang='spa_Latn', tgt_lang='quz_Latn', a=32, b=3, max_input_length=1024, num_beams=4, **kwargs):
    """
    Translate ``text`` using the module-level ``model`` and ``tokenizer``.

    Args:
        text: Input passed straight to the tokenizer.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code assigned to ``tokenizer.tgt_lang``; its token
            id is also used as ``forced_bos_token_id`` for generation.
        a: Constant term of the new-token budget.
        b: Factor applied to the second dimension of ``input_ids``; the budget
            is ``int(a + b * input_ids.shape[1])``.
        max_input_length: Truncation length given to the tokenizer.
        num_beams: Forwarded to ``model.generate``.
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        The list produced by ``tokenizer.batch_decode`` with special tokens
        skipped.
    """
    tokenizer.src_lang, tokenizer.tgt_lang = src_lang, tgt_lang

    batch = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )

    # Same evaluation order as a single generate(...) call would use:
    # device move, then target token id, then the new-token budget.
    on_device = batch.to(model.device)
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    new_token_budget = int(a + b * batch.input_ids.shape[1])

    generated = model.generate(
        **on_device,
        forced_bos_token_id=target_token_id,
        max_new_tokens=new_token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)
72
+
73
+
74
def translate_v2(text, model, tokenizer, src_lang='spa_Latn', tgt_lang='quz_Latn',
                 max_length='auto', num_beams=4, no_repeat_ngram_size=4, n_out=None, **kwargs):
    """
    Translate ``text`` with an explicitly supplied model and tokenizer.

    Args:
        text: A single string or a list of strings.
        model: Object exposing ``eval()``, ``device`` and ``generate(...)``.
        tokenizer: Callable tokenizer exposing ``convert_tokens_to_ids`` and
            ``batch_decode``; its ``src_lang`` attribute is overwritten.
        src_lang: Language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Language code whose token id is forced as the first
            generated token.
        max_length: Generation ``max_length``; ``'auto'`` resolves to
            ``int(32 + 2.0 * input_ids.shape[1])``.
        num_beams: Forwarded to ``model.generate``.
        no_repeat_ngram_size: Forwarded to ``model.generate``.
        n_out: Number of sequences to return per input (``None`` means 1).
        **kwargs: Extra keyword arguments forwarded to ``model.generate``.

    Returns:
        A single decoded string when ``text`` is a ``str`` and ``n_out`` is
        ``None``; otherwise the list returned by ``tokenizer.batch_decode``.
    """
    tokenizer.src_lang = src_lang
    # padding=True lets a list of texts with different token lengths be
    # stacked into one tensor; it changes nothing for a single string.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    if max_length == 'auto':
        max_length = int(32 + 2.0 * encoded.input_ids.shape[1])
    model.eval()
    generated_tokens = model.generate(
        **encoded.to(model.device),
        # Resolve the id from the vocabulary (as translate() does) instead of
        # lang_code_to_id, which may hold only the newly registered language.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=n_out or 1,
        **kwargs
    )
    out = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    if isinstance(text, str) and n_out is None:
        return out[0]
    return out
95
+
96
+
97
# === MAIN ===
# Sample Spanish input; replace it with the text you want to translate.
t = '''
Subes centelleante de labios y de ojeras!
Por tus venas subo, como un can herido
que busca el refugio de blandas aceras.

Amor, en el mundo tú eres un pecado!
Mi beso en la punta chispeante del cuerno
del diablo; mi beso que es credo sagrado!
'''

# perf_counter is the clock intended for measuring elapsed time.
start = time.perf_counter()
result_v1 = translate(t, 'spa_Latn', 'quz_Latn')
# Stop the clock before printing so console output is not part of the timing.
end = time.perf_counter()

print(f"\n{result_v1}")
print(f"\nTime for method v1: {end - start}")


# Alternative: uncomment the lines below to time translate_v2 instead.
# start_v2 = time.perf_counter()
# result_v2 = translate_v2(t, model, tokenizer)
# end_v2 = time.perf_counter()

# print(result_v2)
# print(f"\nTime for method v2: {end_v2 - start_v2}")
122
+ ```