p committed on
Commit
4dffb82
·
1 Parent(s): 6a0e785

max_new_tokens

Browse files
Files changed (1) hide show
  1. app.py +146 -129
app.py CHANGED
@@ -10,7 +10,8 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
10
 
11
 
12
  this_description = '''
13
- Using facebook/m2m100-12B-avg-5-ckpt pre-trained model. Language code:
 
14
  Chinese(zh)
15
  English(en)
16
  Hindi(hi)
@@ -23,106 +24,106 @@ Vietnamese(vi)
23
 
24
  # From facebook/m2m100-12B-avg-5-ckpt
25
  lang_codes = {
26
- "Afrikaans": "af",
27
- "Amharic": "am",
28
- "Arabic": "ar",
29
- "Asturian": "ast",
30
- "Azerbaijani": "az",
31
- "Bashkir": "ba",
32
- "Belarusian": "be",
33
- "Bulgarian": "bg",
34
- "Bengali": "bn",
35
- "Breton": "br",
36
- "Bosnian": "bs",
37
- "Catalan; Valencian": "ca",
38
- "Cebuano": "ceb",
39
- "Czech": "cs",
40
- "Welsh": "cy",
41
- "Danish": "da",
42
- "German": "de",
43
- "Greeek": "el",
44
- "English": "en",
45
- "Spanish": "es",
46
- "Estonian": "et",
47
- "Persian": "fa",
48
- "Fulah": "ff",
49
- "Finnish": "fi",
50
- "French": "fr",
51
- "Western Frisian": "fy",
52
- "Irish": "ga",
53
- "Gaelic; Scottish Gaelic": "gd",
54
- "Galician": "gl",
55
- "Gujarati": "gu",
56
- "Hausa": "ha",
57
- "Hebrew": "he",
58
- "Hindi": "hi",
59
- "Croatian": "hr",
60
- "Haitian; Haitian Creole": "ht",
61
- "Hungarian": "hu",
62
- "Armenian": "hy",
63
- "Indonesian": "id",
64
- "Igbo": "ig",
65
- "Iloko": "ilo",
66
- "Icelandic": "is",
67
- "Italian": "it",
68
- "Japanese": "ja",
69
- "Javanese": "jv",
70
- "Georgian": "ka",
71
- "Kazakh": "kk",
72
- "Central Khmer": "km",
73
- "Kannada": "kn",
74
- "Korean": "ko",
75
- "Luxembourgish; Letzeburgesch": "lb",
76
- "Ganda": "lg",
77
- "Lingala": "ln",
78
- "Lao": "lo",
79
- "Lithuanian": "lt",
80
- "Latvian": "lv",
81
- "Malagasy": "mg",
82
- "Macedonian": "mk",
83
- "Malayalam": "ml",
84
- "Mongolian": "mn",
85
- "Marathi": "mr",
86
- "Malay": "ms",
87
- "Burmese": "my",
88
- "Nepali": "ne",
89
- "Dutch; Flemish": "nl",
90
- "Norwegian": "no",
91
- "Northern Sotho": "ns",
92
- "Occitan": "oc",
93
- "Oriya": "or",
94
- "Panjabi; Punjabi": "pa",
95
- "Polish": "pl",
96
- "Pushto": "ps",
97
- "Portuguese": "pt",
98
- "Romanian; Moldavian; Moldovan": "ro",
99
- "Russian": "ru",
100
- "Sindhi": "sd",
101
- "Sinhala; Sinhalese": "si",
102
- "Slovak": "sk",
103
- "Slovenian": "sl",
104
- "Somali": "so",
105
- "Albanian": "sq",
106
- "Serbian": "sr",
107
- "Swati": "ss",
108
- "Sundanese": "su",
109
- "Swedish": "sv",
110
- "Swahili": "sw",
111
- "Tamil": "ta",
112
- "Thai": "th",
113
- "Tagalog": "tl",
114
- "Tswana": "tn",
115
- "Turkish": "tr",
116
- "Ukrainian": "uk",
117
- "Urdu": "ur",
118
- "Uzbek": "uz",
119
- "Vietnamese": "vi",
120
- "Wolof": "wo",
121
- "Xhosa": "xh",
122
- "Yiddish": "yi",
123
- "Yoruba": "yo",
124
- "Chinese": "zh",
125
- "Zulu": "zu"
126
  }
127
 
128
 
@@ -131,6 +132,22 @@ def m2m_translate(Input_Text, from_lang, to_lang):
131
 
132
  encoded_from_lang = tokenizer(Input_Text, return_tensors="pt")
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  generated_tokens = model.generate(
135
  **encoded_from_lang, forced_bos_token_id=tokenizer.get_lang_id(lang_codes[to_lang]))
136
 
@@ -151,34 +168,34 @@ iface = gr.Interface(
151
  gr.Textbox(lines=5, placeholder="Enter text", label="Text input"),
152
 
153
  gr.Radio(
154
- choices=[
155
- 'Burmese',
156
- 'Chinese',
157
- 'English',
158
- 'Hindi',
159
- 'Japanese',
160
- 'Sinhala',
161
- 'Thai',
162
- 'Vietnamese'
163
- ],
164
- value='Vietnamese',
165
- label='From language'
166
- ),
167
-
168
- gr.Radio(
169
- choices=[
170
- 'Burmese',
171
- 'Chinese',
172
- 'English',
173
- 'Hindi',
174
- 'Japanese',
175
- 'Sinhala',
176
- 'Thai',
177
- 'Vietnamese'
178
- ],
179
- value='English',
180
- label='To language'
181
- ),
182
  ],
183
  outputs="text")
184
 
 
10
 
11
 
12
  this_description = '''
13
+ Using facebook/m2m100-12B-avg-5-ckpt pre-trained model. Some language code:
14
+
15
  Chinese(zh)
16
  English(en)
17
  Hindi(hi)
 
24
 
25
  # From facebook/m2m100-12B-avg-5-ckpt
26
  lang_codes = {
27
+ "Afrikaans": "af",
28
+ "Amharic": "am",
29
+ "Arabic": "ar",
30
+ "Asturian": "ast",
31
+ "Azerbaijani": "az",
32
+ "Bashkir": "ba",
33
+ "Belarusian": "be",
34
+ "Bulgarian": "bg",
35
+ "Bengali": "bn",
36
+ "Breton": "br",
37
+ "Bosnian": "bs",
38
+ "Catalan; Valencian": "ca",
39
+ "Cebuano": "ceb",
40
+ "Czech": "cs",
41
+ "Welsh": "cy",
42
+ "Danish": "da",
43
+ "German": "de",
44
+ "Greeek": "el",
45
+ "English": "en",
46
+ "Spanish": "es",
47
+ "Estonian": "et",
48
+ "Persian": "fa",
49
+ "Fulah": "ff",
50
+ "Finnish": "fi",
51
+ "French": "fr",
52
+ "Western Frisian": "fy",
53
+ "Irish": "ga",
54
+ "Gaelic; Scottish Gaelic": "gd",
55
+ "Galician": "gl",
56
+ "Gujarati": "gu",
57
+ "Hausa": "ha",
58
+ "Hebrew": "he",
59
+ "Hindi": "hi",
60
+ "Croatian": "hr",
61
+ "Haitian; Haitian Creole": "ht",
62
+ "Hungarian": "hu",
63
+ "Armenian": "hy",
64
+ "Indonesian": "id",
65
+ "Igbo": "ig",
66
+ "Iloko": "ilo",
67
+ "Icelandic": "is",
68
+ "Italian": "it",
69
+ "Japanese": "ja",
70
+ "Javanese": "jv",
71
+ "Georgian": "ka",
72
+ "Kazakh": "kk",
73
+ "Central Khmer": "km",
74
+ "Kannada": "kn",
75
+ "Korean": "ko",
76
+ "Luxembourgish; Letzeburgesch": "lb",
77
+ "Ganda": "lg",
78
+ "Lingala": "ln",
79
+ "Lao": "lo",
80
+ "Lithuanian": "lt",
81
+ "Latvian": "lv",
82
+ "Malagasy": "mg",
83
+ "Macedonian": "mk",
84
+ "Malayalam": "ml",
85
+ "Mongolian": "mn",
86
+ "Marathi": "mr",
87
+ "Malay": "ms",
88
+ "Burmese": "my",
89
+ "Nepali": "ne",
90
+ "Dutch; Flemish": "nl",
91
+ "Norwegian": "no",
92
+ "Northern Sotho": "ns",
93
+ "Occitan": "oc",
94
+ "Oriya": "or",
95
+ "Panjabi; Punjabi": "pa",
96
+ "Polish": "pl",
97
+ "Pushto": "ps",
98
+ "Portuguese": "pt",
99
+ "Romanian; Moldavian; Moldovan": "ro",
100
+ "Russian": "ru",
101
+ "Sindhi": "sd",
102
+ "Sinhala; Sinhalese": "si",
103
+ "Slovak": "sk",
104
+ "Slovenian": "sl",
105
+ "Somali": "so",
106
+ "Albanian": "sq",
107
+ "Serbian": "sr",
108
+ "Swati": "ss",
109
+ "Sundanese": "su",
110
+ "Swedish": "sv",
111
+ "Swahili": "sw",
112
+ "Tamil": "ta",
113
+ "Thai": "th",
114
+ "Tagalog": "tl",
115
+ "Tswana": "tn",
116
+ "Turkish": "tr",
117
+ "Ukrainian": "uk",
118
+ "Urdu": "ur",
119
+ "Uzbek": "uz",
120
+ "Vietnamese": "vi",
121
+ "Wolof": "wo",
122
+ "Xhosa": "xh",
123
+ "Yiddish": "yi",
124
+ "Yoruba": "yo",
125
+ "Chinese": "zh",
126
+ "Zulu": "zu"
127
  }
128
 
129
 
 
132
 
133
  encoded_from_lang = tokenizer(Input_Text, return_tensors="pt")
134
 
135
+ generated_tokens = model.generate(
136
+ **encoded_from_lang,
137
+ max_new_tokens=200,
138
+ forced_bos_token_id=tokenizer.get_lang_id(lang_codes[to_lang])
139
+ )
140
+
141
+ res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
142
+
143
+ return res[0]
144
+
145
+
146
+ def m2m_translate2(Input_Text, from_lang, to_lang):
147
+ tokenizer.src_lang = lang_codes[from_lang]
148
+
149
+ encoded_from_lang = tokenizer(Input_Text, return_tensors="pt")
150
+
151
  generated_tokens = model.generate(
152
  **encoded_from_lang, forced_bos_token_id=tokenizer.get_lang_id(lang_codes[to_lang]))
153
 
 
168
  gr.Textbox(lines=5, placeholder="Enter text", label="Text input"),
169
 
170
  gr.Radio(
171
+ choices=[
172
+ 'Burmese',
173
+ 'Chinese',
174
+ 'English',
175
+ 'Hindi',
176
+ 'Japanese',
177
+ 'Sinhala',
178
+ 'Thai',
179
+ 'Vietnamese'
180
+ ],
181
+ value='Vietnamese',
182
+ label='From language'
183
+ ),
184
+
185
+ gr.Radio(
186
+ choices=[
187
+ 'Burmese',
188
+ 'Chinese',
189
+ 'English',
190
+ 'Hindi',
191
+ 'Japanese',
192
+ 'Sinhala',
193
+ 'Thai',
194
+ 'Vietnamese'
195
+ ],
196
+ value='English',
197
+ label='To language'
198
+ ),
199
  ],
200
  outputs="text")
201