# translation_fr-als / functions_model.py
# Uploaded by JoanneAB with huggingface_hub (commit c75118d, verified)
#!/usr/bin/env python3
import numpy as np

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# --------------------------------------------------------------------------------------------------
# from huggingface_example_translation.ipynb
def encode(examples, tokenizer, max_input_length=128, max_target_length=128):
    """Tokenize a batch of translation pairs, truncating both sides.

    Args:
        examples: batch in HF ``Datasets`` translation format — a dict with a
            ``"translation"`` key holding a list of ``{src_lang: ..., tgt_lang: ...}``
            dicts, keyed by ``tokenizer.src_lang`` / ``tokenizer.tgt_lang``.
        tokenizer: the model tokenizer; must expose ``src_lang``, ``tgt_lang``
            and ``as_target_tokenizer()``.
        max_input_length: truncation length for the source sentences.
        max_target_length: truncation length for the target sentences.

    Returns:
        The tokenizer output for the (prefixed) source texts, with an extra
        ``"labels"`` entry holding the target token ids.
    """
    # T5-style checkpoints expect a task prefix on every source sentence.
    prefix = "Translate French to Alsacien: "
    pairs = examples["translation"]
    source_texts = [prefix + pair[tokenizer.src_lang] for pair in pairs]
    target_texts = [pair[tokenizer.tgt_lang] for pair in pairs]
    model_inputs = tokenizer(source_texts, max_length=max_input_length, truncation=True)
    # Switch the tokenizer into target mode so labels are encoded correctly.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(target_texts, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs
# --------------------------------------------------------------------------------------------------
def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generation length for an eval step.

    NOTE(review): relies on module-level globals that must exist before this
    is called by the trainer: ``tokenizer``, ``metric`` (presumably
    ``sacrebleu``, given the ``result["score"]`` key — TODO confirm),
    ``postprocess_text`` (not defined in this file) and ``np`` (numpy,
    not imported at the top of this file — verify).

    Args:
        eval_preds: ``(predictions, labels)`` pair from the trainer;
            ``predictions`` may itself be a tuple whose first element
            holds the generated token ids.

    Returns:
        dict with ``"bleu"`` (corpus score) and ``"gen_len"`` (mean count of
        non-pad tokens per prediction), each rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Some models return (token_ids, ...) tuples; keep only the token ids.
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Mean generated length, counting only non-pad tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
# --------------------------------------------------------------------------------------------------
def do_translation(text, model_name, return_text=True, return_token=False):
    """Translate ``text`` with the fine-tuned seq2seq model ``model_name``.

    Args:
        text: source sentence to translate.
        model_name: Hub id or local path of the trained checkpoint.
        return_text: if True (default), return the decoded translation string.
        return_token: if True (and ``return_text`` is False), return the raw
            generated token-id tensor instead.

    Returns:
        The translated string, the generated token ids, or ``None`` when both
        flags are False. Note: sampling is enabled, so output is stochastic.
    """
    # Bug fix: the original also built a `pipeline(...)` and ran a translation
    # whose result was discarded — that loaded the model twice for nothing.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Tokenize the text: text -> token ids.
    input_ids = tokenizer(text, return_tensors="pt").input_ids
    # Generate the translation with nucleus/top-k sampling.
    outputs = model.generate(input_ids, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
    if return_text:
        # Token ids -> text.
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    if return_token:  # bug fix: was the undefined names `return_tokens` / `output`
        return outputs[0]
    return None