# NOTE: the lines that previously appeared here ("Spaces:", "Runtime error",
# file size, commit hashes, line-number gutter) were Hugging Face Space page
# chrome captured by the scraper, not source code; kept only as this comment
# so the file remains valid Python.
#importing the necessary library
import re
import nltk
import spacy
import math
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
def read_in_text(url: str) -> str:
    """Read and return the full contents of the text file at *url*.

    Parameters
    ----------
    url : str
        Path to a local text file containing the article.

    Returns
    -------
    str
        The file's entire contents as a single string.
    """
    # Explicit encoding: without it, open() falls back to the platform
    # default (e.g. cp1252 on Windows) and can mangle or reject articles.
    with open(url, "r", encoding="utf-8") as file:
        return file.read()
def clean_text(url: str) -> str:
    """Normalize raw article text for sentence segmentation.

    Drops all non-ASCII characters (e.g. CJK or accented letters) and
    collapses every run of whitespace (newlines, tabs, repeated spaces)
    into a single space, trimming the ends.

    Parameters
    ----------
    url : str
        The raw article text (despite the name, this is the text itself,
        not a path — see the caller `final_summary`).

    Returns
    -------
    str
        The cleaned, single-line article text.
    """
    # Remove non-ASCII characters by round-tripping through ASCII.
    text = url.encode("ascii", errors="ignore").decode("ascii")
    # One pass replaces the original chain of substitutions
    # (\n -> " ", \t -> " ", then " +" -> " "); note the old
    # r"\n\n" substitution was dead code — it ran after every single
    # \n had already been replaced, so it could never match.
    text = re.sub(r"[ \t\n]+", " ", text).strip()
    return text
# Initializing the summarization model: a distilled BART checkpoint
# fine-tuned for summarization, pulled from the Hugging Face hub.
from transformers import BartTokenizer, BartForConditionalGeneration
# Model and tokenizer must come from the same checkpoint so that the
# token ids produced by the tokenizer match the model's vocabulary.
model = BartForConditionalGeneration.from_pretrained("jaimin/distilbart-summarizer")
tokenizer = BartTokenizer.from_pretrained("jaimin/distilbart-summarizer")
# spaCy English pipeline, used below only for sentence segmentation.
nlp = spacy.load("en_core_web_sm")
# Defining a function to get the summary of the article
def final_summary(file: str) -> str:
    """Iteratively summarize an article into a bulleted list.

    The text is repeatedly chunked into sentence batches and summarized,
    and the concatenated summaries are fed back in as the new text, until
    fewer than 10 bullet points remain.

    Parameters
    ----------
    file : str
        The raw article text (a string, not a path — gradio passes the
        Textbox contents directly).

    Returns
    -------
    str
        Newline-joined bullet points, each prefixed with "* ".

    NOTE(review): if a pass never reduces the sentence count below 10,
    the outer while loop does not terminate — TODO confirm intended.
    """
    # reading in the text and tokenizing it into sentence
    text = clean_text(file)
    bullet_points = 10  # seed value so the reduction loop runs at least once
    while (bullet_points >= 10):
        # Split the current text into sentences via spaCy.
        chunks = []
        sentences = nlp(text)
        for sentence in sentences.sents:
            chunks.append(str(sentence))
        output = []
        sentences_remaining = len(chunks)
        i = 0
        # looping through the sentences in an equal batch based on their length and summarizing them
        while sentences_remaining > 0:
            # Batch roughly 10 sentences at a time, evenly sized so the
            # final batch is not a tiny remainder.
            chunks_remaining = math.ceil(sentences_remaining / 10.0)
            next_chunk_size = math.ceil(sentences_remaining / chunks_remaining)
            sentence = "".join(chunks[i:i+next_chunk_size])
            i += next_chunk_size
            sentences_remaining -= next_chunk_size
            inputs = tokenizer(sentence, return_tensors="pt", padding="longest")
            #inputs = inputs.to(DEVICE)
            original_input_length = len(inputs["input_ids"][0])
            # checking if the length of the input batch is less than 150
            # NOTE(review): comment says 150 but the code tests 100 — the
            # code's 100-token threshold is what actually runs.
            if original_input_length < 100:
                # Too short to summarize: pass the sentences through verbatim.
                split_sentences = nlp(sentence)
                for split_sentence in split_sentences.sents:
                    output.append(str(split_sentence).rstrip("."))
            # checking if the length of the input batch is greater than 1024
            # (1024 tokens is BART's maximum input length)
            elif original_input_length > 1024:
                # Over the model limit: halve the batch and summarize each half.
                sent = sent_tokenize(sentence)
                length_sent = len(sent)
                j = 0
                sent_remaining = math.ceil(length_sent / 2)
                # going through the batch that is greater than 1024 and dividing them
                while length_sent > 0:
                    halved_sentence = "".join(sent[j:j+sent_remaining])
                    halved_inputs = tokenizer(halved_sentence, return_tensors="pt")
                    #halved_inputs = halved_inputs.to(DEVICE)
                    halved_summary_ids = model.generate(halved_inputs["input_ids"])
                    j += sent_remaining
                    length_sent -= sent_remaining
                    # checking if the length of the output summary is less than the original text
                    # (only keep summaries that actually compress the input)
                    if len(halved_summary_ids[0]) < len(halved_inputs["input_ids"][0]):
                        halved_summary = tokenizer.batch_decode(halved_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                        output.append(halved_summary)
            else:
                # Normal case: 100..1024 tokens — summarize the batch directly.
                summary_ids = model.generate(inputs["input_ids"])
                if len(summary_ids[0]) < original_input_length:
                    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
                    output.append(summary)
        # Split each summary paragraph back into individual sentences,
        # stripping stray " ." artifacts left by the tokenizer.
        final_output = []
        for paragraphs in output:
            lines = paragraphs.split(" . ")
            for line in lines:
                final_output.append(line.replace(" .", "").strip())
        # Feed the condensed text back in for another reduction pass.
        text = ".".join(final_output)
        bullet_points = len(final_output)
    # Fewer than 10 points remain: format each as a bullet.
    for i in range(len(final_output)):
        final_output[i] = "* " + final_output[i] + "."
    # final sentences are incoherent, so we will join them by bullet separator
    summary_bullet = "\n".join(final_output)
    return summary_bullet
# Creating the web interface for the summarizer using gradio.
# Fixes: the original line ended with a stray "|" (syntax error) and used
# the removed gradio 2.x API (gr.inputs.Textbox / gr.outputs.Textbox,
# optional=..., theme="darkhuggingface"); modern gradio takes component
# instances directly and no longer accepts that theme string.
demo = gr.Interface(
    fn=final_summary,
    inputs=gr.Textbox(label="Drop your article here"),
    outputs=gr.Textbox(label="Summary"),
    title="ARTICLE SUMMARIZER",
)
demo.launch()