File size: 3,305 Bytes
0b1f426 f7a7751 0b1f426 f7a7751 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
from transformers import T5Tokenizer, MT5ForConditionalGeneration
from simpletransformers.t5 import T5Model
import datetime
import logging
import os
class Inference:
    """Generate paraphrases of a sentence with an MT5 conditional-generation model."""

    # Punctuation and spaces that are ignored when deciding whether a proposal
    # is only a trivial variant of the original sentence.
    _IGNORED_CHARS = str.maketrans("", "", ".! ?,")

    def _discard_recommendations(self, original, proposal):
        """Return True when *proposal* is a trivial variant of *original*.

        Two sentences are considered equivalent when they are equal after
        lower-casing and removing the characters ``. ! ? ,`` and spaces.

        Args:
            original: The source sentence.
            proposal: A candidate paraphrase.

        Returns:
            True if the proposal should be discarded, False otherwise.
        """
        table = Inference._IGNORED_CHARS
        # Identical strings stay identical after stripping, so one comparison
        # covers both the exact-match and the punctuation-only-difference case.
        return proposal.lower().translate(table) == original.lower().translate(table)

    def _select_predictions(self, sentence, candidates, n_predictions):
        """Pick up to *n_predictions* distinct, non-trivial paraphrases.

        Args:
            sentence: The source sentence the candidates were generated from.
            candidates: Iterable of decoded candidate sentences; it is consumed
                lazily and only as far as needed.
            n_predictions: Maximum number of sentences to return.

        Returns:
            A list of accepted sentences, in candidate order, with the
            typographic apostrophe (’) replaced by a plain one (').
        """
        outputs = []
        discarded = 0
        for candidate in candidates:
            if len(outputs) >= n_predictions:
                break

            # Normalise *before* the duplicate check: the stored sentences are
            # already normalised, so comparing the raw candidate would let two
            # identical candidates containing ’ through.
            normalized = candidate.replace("’", "'")
            if (
                self._discard_recommendations(sentence, candidate)
                or self._discard_recommendations(sentence, normalized)
                or normalized in outputs
            ):
                logging.debug("Discarded: %s - source:%s", candidate, sentence)
                discarded += 1
            else:
                outputs.append(normalized)

        logging.debug("%d candidates discarded for %s", discarded, sentence)
        return outputs

    # https://github.com/Vamsi995/Paraphrase-Generator/blob/master/evaluate.py
    def get_paraphrases(
        self,
        model_name,
        sentence,
        temperature,
        prefix="paraphrase: ",
        n_predictions=2,
        top_k=120,
        max_length=256,
        device="cpu",
    ):
        """Return up to *n_predictions* paraphrases of *sentence*.

        The model and tokenizer are loaded from *model_name* on every call.

        Args:
            model_name: Name or path passed to ``from_pretrained``.
            sentence: The sentence to paraphrase.
            temperature: Sampling temperature; sampling is enabled only when
                it is greater than zero.
            prefix: Task prefix prepended to the sentence.
            n_predictions: Maximum number of paraphrases to return. Twice as
                many sequences are requested because some get discarded.
            top_k: ``top_k`` value forwarded to ``generate``.
            max_length: ``max_length`` value forwarded to ``generate``.
            device: Device the model and the input tensors are moved to.

        Returns:
            A list of distinct paraphrases that differ from *sentence* by more
            than case, spacing or basic punctuation.
        """
        model = MT5ForConditionalGeneration.from_pretrained(model_name)
        tokenizer = T5Tokenizer.from_pretrained(model_name)
        # The inputs are moved to `device` below; the model has to live on the
        # same device or generation fails for anything other than the default.
        model.to(device)

        text = prefix + sentence + " </s>"
        # NOTE(review): `pad_to_max_length` is deprecated in recent transformers
        # releases in favour of `padding=`; confirm against the pinned version.
        encoding = tokenizer.encode_plus(
            text, pad_to_max_length=True, return_tensors="pt"
        )
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)

        do_sample = temperature > 0
        print(f"do_sample: {do_sample}")
        print(f"temperature: {temperature}")

        # https://huggingface.co/blog/how-to-generate
        # https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html
        requested = n_predictions * 2  # ask for twice since some will be discarded
        model_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            do_sample=do_sample,
            max_length=max_length,
            top_k=top_k,
            num_beams=requested,
            top_p=0.98,
            temperature=temperature,
            early_stopping=True,
            num_return_sequences=requested,
        )

        logging.debug("%d predictions for %s", len(model_output), sentence)

        # Generator: sequences are decoded only until enough have been accepted.
        candidates = (
            tokenizer.decode(
                output, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for output in model_output
        )
        return self._select_predictions(sentence, candidates, n_predictions)
def main():
    """Paraphrase a sample Catalan sentence using the model in the current directory."""
    sentence = "Aquesta és una associació sense ànim de lucre amb la missió de fomentar la presència i l'ús del català."

    # The model files are expected to sit in the working directory.
    model_dir = os.getcwd()
    paraphrases = Inference().get_paraphrases(model_dir, sentence, 1.0)

    print(f"original: {sentence}")
    for option in paraphrases:
        print(f" {option}")
# Run the demo only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|