Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import gradio as gr | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| nltk.download('punkt') | |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM | |
| from transformers import PegasusForConditionalGeneration, PegasusTokenizer | |
| import torch | |
def read_in_text(url):
    """Return the full contents of the text file located at ``url``.

    Despite the parameter name, ``url`` is handed directly to ``open`` and is
    therefore treated as a local file path. The file is opened in text mode
    using the platform's default encoding.
    """
    with open(url, 'r') as handle:
        return handle.read()
# Byline pattern, written against lower-cased text. Two alternatives:
#   1. "by <names> - dd/dd/dd <digits>:<digits> xx xx"  (date, time, two 2-char tokens)
#   2. "by <names>dd, dddd"                             (two digits, comma, four digits)
# A raw string is used so that \s, \d, \w and \/ reach the regex engine
# untouched instead of being parsed as (invalid) string escapes.
_BYLINE_PATTERN = re.compile(
    r'(by[\s\w,|]+ - \d\d\/\d\d\/\d\d\s\d+:\d+\s\w{2}\s\w{2})|(by[\s\w|,]+\d\d,\s\d{4})'
)


def clean_text(url):
    """Lower-case the given text and strip author/date bylines from it.

    Args:
        url: The raw text to clean. (The name is historical; the value is
            used as text, not as a URL or path.)

    Returns:
        The lower-cased text with every match of ``_BYLINE_PATTERN`` removed.
        Surrounding whitespace is left as-is.
    """
    # Converting the text to all lower case; the pattern only matches "by".
    text = url.lower()
    # Removing the dates, time and name of author.
    return _BYLINE_PATTERN.sub("", text)
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print ("device ",device)

# Load both paraphrasing checkpoints once at import time.
# Both models are moved to `device`: the paraphrase helper sends its input
# tensors to `device`, so a model left on the CPU would fail with a device
# mismatch whenever CUDA is available.
T5_model = AutoModelForSeq2SeqLM.from_pretrained("jaimin/T5-Large").to(device)
T5_tokenizer = AutoTokenizer.from_pretrained("jaimin/T5-Large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained('jaimin/pegasus').to(device)
pegasus_tokenizer = PegasusTokenizer.from_pretrained('jaimin/pegasus')
# Diverse Beam search
def my_paraphrase(sentence, model, tokenizer,beams):
    """Paraphrase one sentence with diverse beam search.

    Args:
        sentence: The sentence to paraphrase.
        model: A seq2seq model exposing ``device``, ``eval()`` and
            ``generate()`` (the T5 or Pegasus checkpoint loaded by this file).
        tokenizer: The tokenizer that belongs to ``model``.
        beams: Total number of beams. It is cast to ``int`` before being
            passed to ``generate``. NOTE(review): transformers' group beam
            search is documented to require ``num_beams`` to be divisible by
            ``num_beam_groups`` (5 here) - confirm for the pinned version.

    Returns:
        The decoded text of the first of the five returned sequences, with
        special tokens removed.
    """
    # Prompt format kept byte-for-byte: task prefix plus explicit end marker.
    text = "paraphrase: " + sentence + " </s>"
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoding = tokenizer(text, padding=True, return_tensors="pt", truncation=True)

    # Send the inputs to wherever *this* model lives rather than to a global
    # device, so a model on a different device still gets matching tensors.
    target_device = model.device
    input_ids = encoding["input_ids"].to(target_device)
    attention_mask = encoding["attention_mask"].to(target_device)

    model.eval()
    diverse_beam_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        early_stopping=True,
        num_beams=int(beams),
        num_beam_groups=5,
        num_return_sequences=5,
        diversity_penalty=0.70,
    )
    # Only the first returned candidate is used.
    return tokenizer.decode(
        diverse_beam_outputs[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
def return_output(file, models,beams):
    """Paraphrase a block of text sentence by sentence with the chosen model.

    Args:
        file: The input text (the name is historical; this is the text
            itself, not a file object).
        models: Which model to use, either ``'T5'`` or ``'Pegasus'``.
        beams: Number of beams forwarded to ``my_paraphrase``.

    Returns:
        The paraphrased sentences joined by single spaces, with any
        ``paraphrasedoutput:`` marker removed and each ``.<n>`` replaced by a
        period followed by a newline. Empty or ``None`` input yields ``""``.

    Raises:
        ValueError: If the text is non-empty and ``models`` is not one of the
            supported names.
    """
    # Nothing to paraphrase: return early instead of failing on None.
    if not file:
        return ""

    # Fail with a clear message instead of hitting an unbound `model` later
    # (for example when the dropdown is left unselected and sends None).
    if models not in ('T5', 'Pegasus'):
        raise ValueError(
            f"Unknown model {models!r}; expected 'T5' or 'Pegasus'."
        )

    if models == 'T5':
        model, tokenizer = T5_model, T5_tokenizer
    else:
        model, tokenizer = pegasus_model, pegasus_tokenizer

    sentence = clean_text(file)
    output = " ".join(
        my_paraphrase(sent, model, tokenizer, beams)
        for sent in sent_tokenize(sentence)
    )
    # Strip generation artifacts left in the decoded text.
    new_output = output.replace('paraphrasedoutput:', "")
    return new_output.replace('.<n>', '.\n')
# Gradio UI: a text box, a model selector and a beam-count slider feeding
# `return_output`, with a single text box for the result.
#
# All widgets use the top-level component classes (gr.Textbox, gr.Dropdown,
# gr.Slider). The original mixed these with the legacy `gr.inputs.*` /
# `gr.outputs.*` namespaces and passed legacy-only keyword arguments
# (`type`, `default`, `optional`) to `gr.Slider`.
demo = gr.Interface(
    fn=return_output,
    inputs=[
        gr.Textbox(label="Text"),
        # No initial value is set, matching the original `default=None`.
        gr.Dropdown(choices=['Pegasus', 'T5'], type="value", label="Models"),
        # `randomize=True` picks the starting value at load time, so no
        # explicit initial value is given. Range 5..25 in steps of 5.
        gr.Slider(
            minimum=5,
            maximum=25,
            step=5,
            randomize=True,
            label="Number of Beams",
        ),
    ],
    outputs=[gr.Textbox(label="Summary")],
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()