Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,12 +2,13 @@ import gradio as gr
|
|
| 2 |
import spaces
|
| 3 |
import torch
|
| 4 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
|
|
|
| 5 |
|
| 6 |
-
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "
|
| 7 |
-
|
| 8 |
-
langs = {**favourite_langs, **all_langs}
|
| 9 |
|
| 10 |
-
options = list(langs.keys())
|
|
|
|
| 11 |
models = ["Helsinki-NLP", "t5-base", "t5-small", "t5-large", "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/mbart-large-50-many-to-many-mmt"]
|
| 12 |
|
| 13 |
def model_to_cuda(model):
|
|
@@ -37,24 +38,22 @@ def translate_text(input_text, sselected_language, tselected_language, model_nam
|
|
| 37 |
return f"Error finding model: {model_name}! Try other available language combination.", error
|
| 38 |
|
| 39 |
if model_name.startswith('facebook/nllb'):
|
| 40 |
-
|
| 41 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=nllb_language_codes[sselected_language])
|
| 42 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
| 43 |
-
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=nllb_language_codes[sselected_language], tgt_lang=nllb_language_codes[tselected_language])
|
| 44 |
translated_text = translator(input_text, max_length=512)
|
| 45 |
return translated_text[0]['translation_text'], message_text
|
| 46 |
|
| 47 |
-
if model_name.startswith('facebook/mbart-large
|
| 48 |
-
from languagecodes import mbart_large_languages
|
| 49 |
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
|
| 50 |
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
| 51 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
| 52 |
# translate source to target
|
| 53 |
-
tokenizer.src_lang = mbart_large_languages[sselected_language]
|
| 54 |
encoded = tokenizer(input_text, return_tensors="pt")
|
| 55 |
generated_tokens = model.generate(
|
| 56 |
**encoded,
|
| 57 |
-
forced_bos_token_id=tokenizer.lang_code_to_id[mbart_large_languages[tselected_language]]
|
| 58 |
)
|
| 59 |
return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0], message_text
|
| 60 |
|
|
|
|
| 2 |
import spaces
|
| 3 |
import torch
|
| 4 |
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
| 5 |
+
import languagecodes
|
| 6 |
|
| 7 |
+
favourite_langs = {"German": "de", "Romanian": "ro", "English": "en", "-----": "-----"}
|
| 8 |
+
langs = languagecodes.iso_languages
|
|
|
|
| 9 |
|
| 10 |
+
# options = list(langs.keys())
|
| 11 |
+
options = [(k, v) for k,v in favourite_langs.items()].extend([(k, v) for k,v in langs.items()])
|
| 12 |
models = ["Helsinki-NLP", "t5-base", "t5-small", "t5-large", "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/mbart-large-50-many-to-many-mmt"]
|
| 13 |
|
| 14 |
def model_to_cuda(model):
|
|
|
|
| 38 |
return f"Error finding model: {model_name}! Try other available language combination.", error
|
| 39 |
|
| 40 |
if model_name.startswith('facebook/nllb'):
|
| 41 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang=languagecodes.nllb_language_codes[sselected_language])
|
|
|
|
| 42 |
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
|
| 43 |
+
translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=languagecodes.nllb_language_codes[sselected_language], tgt_lang=languagecodes.nllb_language_codes[tselected_language])
|
| 44 |
translated_text = translator(input_text, max_length=512)
|
| 45 |
return translated_text[0]['translation_text'], message_text
|
| 46 |
|
| 47 |
+
if model_name.startswith('facebook/mbart-large
|
|
|
|
| 48 |
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
|
| 49 |
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
| 50 |
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
|
| 51 |
# translate source to target
|
| 52 |
+
tokenizer.src_lang = languagecodes.mbart_large_languages[sselected_language]
|
| 53 |
encoded = tokenizer(input_text, return_tensors="pt")
|
| 54 |
generated_tokens = model.generate(
|
| 55 |
**encoded,
|
| 56 |
+
forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[tselected_language]]
|
| 57 |
)
|
| 58 |
return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0], message_text
|
| 59 |
|