File size: 2,508 Bytes
d75114e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa3e0a7
d75114e
 
 
df5793e
d75114e
1705ee7
d75114e
 
 
 
 
 
 
 
 
 
 
 
b1fef81
 
d75114e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1705ee7
 
 
 
 
d75114e
05c3737
3d8adc6
f17e44d
d75114e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import gradio as gr
from gradio.mix import Parallel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os
from transformers import T5TokenizerFast, T5ForConditionalGeneration
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
import itertools
import random
import nltk
from nltk.tokenize import sent_tokenize
import requests
import json
nltk.download('punkt')
from fastT5 import export_and_get_onnx_model


# Pick the GPU when one is available. NOTE(review): get_paraphrases defaults
# to device="cpu", so this global is only used if a caller passes it in.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Tokenizer comes from the Hugging Face hub; the model is exported to ONNX
# via fastT5 for faster CPU inference (downloads/exports on first run).
T5_tokenizer = AutoTokenizer.from_pretrained("jaimin/T5-Large")
T5_model = export_and_get_onnx_model('jaimin/T5-Large')



def get_paraphrases(text, n_predictions=3, top_k=50, max_length=256, device="cpu"):
    """Generate diverse paraphrases of *text* as a bulleted string.

    Each sentence of *text* is paraphrased independently with diverse beam
    search; the per-sentence alternatives are then combined via a Cartesian
    product, shuffled, and returned one per "* ..." bullet line.

    Args:
        text: Input text; split into sentences with NLTK's punkt tokenizer.
        n_predictions: Paraphrase candidates kept per sentence.
        top_k: Unused; kept for backward compatibility (beam search does not
            sample, so top-k filtering does not apply here).
        max_length: Maximum generated token length per sentence.
        device: Torch device string for the input tensors.

    Returns:
        A newline-joined string with one "* <paraphrase>." bullet per
        combination of per-sentence alternatives.
    """
    per_sentence = []
    for sent in sent_tokenize(text):
        # "paraphrase:" is the task prompt the fine-tuned T5 expects; the
        # explicit " </s>" is the legacy T5 end-of-sequence convention.
        prompt = "paraphrase: " + sent + " </s>"
        encoding = T5_tokenizer.encode_plus(
            prompt, padding=True, return_tensors="pt", truncation=True
        )
        input_ids = encoding["input_ids"].to(device)
        attention_masks = encoding["attention_mask"].to(device)
        model_output = T5_model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=max_length,  # bug fix: was hard-coded to 512, ignoring the parameter
            early_stopping=True,
            num_beams=15,
            num_beam_groups=3,
            num_return_sequences=n_predictions,
            diversity_penalty=0.70,
            temperature=0.7,
            no_repeat_ngram_size=2,
        )
        candidates = []
        for output in model_output:
            generated_sent = T5_tokenizer.decode(
                output, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            # Bug fix: compare against the CURRENT sentence (`sent`), not the
            # whole input text, so per-sentence echoes are actually dropped.
            if (
                generated_sent.lower() != sent.lower()
                and generated_sent not in candidates
            ):
                candidates.append(generated_sent.replace('paraphrasedoutput:', ""))
        per_sentence.append(candidates)

    # Cartesian product of per-sentence alternatives. NOTE(review): this grows
    # as n_predictions ** num_sentences — keep inputs short.
    combos = list(itertools.product(*per_sentence))
    random.shuffle(combos)
    return "\n".join("* " + " ".join(combo) + "." for combo in combos)
    
    
# Wire the paraphraser into a simple Gradio UI (5-line textbox in, text out)
# and start the local web server (blocks until the server stops).
# NOTE(review): gr.inputs.Textbox is the legacy Gradio API; recent versions
# use gr.Textbox — confirm against the pinned gradio version.
iface = gr.Interface(fn=get_paraphrases, inputs=[gr.inputs.Textbox(lines=5)],outputs="text")
iface.launch()