import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

def encode(examples, tokenizer, max_input_length=128, max_target_length=128):
    """
    Tokenize the inputs and targets, truncating both to their maximum lengths.
    """
    prefix = "Translate French to Alsacien: "

    inputs = [prefix + ex[tokenizer.src_lang] for ex in examples["translation"]]
    targets = [ex[tokenizer.tgt_lang] for ex in examples["translation"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets in target mode so the labels are built with the
    # target-language vocabulary and special tokens.
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

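# A minimal usage sketch (not in the original): `encode` is meant to be
# applied batch-wise with `datasets.Dataset.map`. The helper below is
# hypothetical; `raw_datasets` is assumed to be a `DatasetDict` whose rows
# carry a "translation" column of {src_lang: ..., tgt_lang: ...} dicts.
def preprocess_dataset(raw_datasets, tokenizer):
    return raw_datasets.map(
        lambda batch: encode(batch, tokenizer),
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )
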
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 (positions ignored by the loss) with the pad token id
    # so the labels can be decoded.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Average generated length, not counting padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

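# `compute_metrics` relies on three module-level names not defined in this
# listing: `tokenizer`, `metric`, and `postprocess_text`. A minimal sketch of
# the missing pieces, assumed to follow the standard Hugging Face translation
# example (sacreBLEU as the metric, whose `compute` returns a "score" key):
#
# import evaluate
# metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    # Strip whitespace; sacreBLEU expects each reference as a list of strings.
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
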
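# Hedged sketch (not shown in the original) of how `compute_metrics` plugs
# into training: a `Seq2SeqTrainer` with `predict_with_generate=True`, so that
# `eval_preds` contains generated token ids. `model` and `tokenized` stand in
# for the loaded model and the preprocessed `DatasetDict`.
#
# from transformers import (DataCollatorForSeq2Seq, Seq2SeqTrainer,
#                           Seq2SeqTrainingArguments)
#
# args = Seq2SeqTrainingArguments(
#     output_dir="fr-als-model",  # hypothetical output path
#     evaluation_strategy="epoch",
#     predict_with_generate=True,
# )
# trainer = Seq2SeqTrainer(
#     model=model,
#     args=args,
#     train_dataset=tokenized["train"],
#     eval_dataset=tokenized["validation"],
#     data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )
# trainer.train()
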
def do_translation(text, model_name, return_text=True, return_tokens=False):
    """
    Translate `text` with the trained model.
    Return either the decoded translation or the generated token ids.
    """
    # One-liner alternative via the pipeline API; the "XX"/"YY" language
    # codes are placeholders, as in the original:
    # translator = pipeline("translation_XX_to_YY", model=model_name)
    # translator(text)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt").input_ids

    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Sample up to 40 new tokens with top-k / nucleus sampling.
    outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if return_text:
        return translation
    if return_tokens:
        return outputs[0]
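
# Hedged usage example (assumed): the checkpoint name is a hypothetical
# placeholder for the fine-tuned French-to-Alsatian model.
if __name__ == "__main__":
    sample = "Bonjour, comment allez-vous ?"
    print(do_translation(sample, "my_user/fr-als-model"))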