Spaces:
Runtime error
Runtime error
| from sumy.parsers.html import HtmlParser | |
| from sumy.parsers.plaintext import PlaintextParser | |
| from sumy.nlp.tokenizers import Tokenizer | |
| from sumy.summarizers.lex_rank import LexRankSummarizer | |
| from sumy.nlp.stemmers import Stemmer | |
| from sumy.utils import get_stop_words | |
| import metrics | |
| import os | |
| import nltk | |
def summarize(in_text):
    """Summarize ``in_text`` with LexRank, keeping the result under ~1024 tokens.

    Sentences shorter than 10 words are discarded before summarizing.
    LexRank only accepts a sentence count as its size limit, so we grow the
    summary one sentence at a time and stop just before the token count
    (as reported by the project-local ``metrics`` module) exceeds the target.

    Returns:
        On success: (out_text, n_words, n_sents, n_chars, n_tokens).
        On empty input: ('Error: No text provided', None).
        NOTE(review): the error tuple has a different arity than the success
        tuple — callers that unpack five values must check for this first.
    """
    if not in_text:
        return 'Error: No text provided', None
    # Make sure the nltk punkt tokenizer data is available (downloaded once).
    nltk_file = '/home/user/nltk_data/tokenizers/punkt.zip'
    if os.path.exists(nltk_file):
        print('nltk punkt file exists in ', nltk_file)
    else:
        print("downloading punkt file")
        nltk.download('punkt')
    # Discard all sentences that have less than 10 words in them.
    long_sentences = [sen for sen in in_text.split('.') if len(sen.split()) > 10]
    in_text = '.'.join(long_sentences) + '.'
    # The size of the summary is limited to 1024 tokens. The LexRank
    # algorithm accepts only a sentence count as a limit, so we start with
    # one sentence and increase the count until the next summary would
    # exceed the token budget.
    target_tokens = 1024
    in_sents = metrics.num_sentences(in_text)
    out_text = get_Summary(in_text, 1)
    n_tokens = metrics.num_tokens(out_text)
    prev_out_text = out_text
    for sen in range(2, in_sents):
        if n_tokens >= target_tokens:
            # Current candidate overshoots the budget: fall back to the
            # previous (shorter) summary.  The original code kept the
            # over-limit text while reporting the previous token count,
            # so text and count disagreed — fixed here.
            out_text = prev_out_text
            break
        prev_out_text = out_text
        out_text = get_Summary(in_text, sen)
        n_tokens = metrics.num_tokens(out_text)
    # Recompute every metric from the text actually returned so the
    # reported numbers always match out_text.
    n_tokens = metrics.num_tokens(out_text)
    n_sents = metrics.num_sentences(out_text)
    n_words = metrics.num_words(out_text)
    n_chars = metrics.num_chars(out_text)
    return out_text, n_words, n_sents, n_chars, n_tokens
def get_Summary(in_text, nr_sentences):
    """Build a newline-separated summary of ``in_text`` via LexRank.

    A LexRank "sentence" can itself contain several real sentences separated
    by full stops, which would break downstream timestamp lookup.  To avoid
    that, every inner full stop is stripped from each LexRank sentence, and
    the sentences are then re-joined with one full stop + newline each.

    Returns the summary string, or the tuple
    ('Error: No sentences available', None) when ``nr_sentences`` is zero.
    """
    if nr_sentences == 0:
        return 'Error: No sentences available', None
    list_summary = get_Lexrank(in_text, nr_sentences)
    # Strip full stops inside each ranked sentence, then join them.
    cleaned = [str(item).replace('.', '') for item in list_summary]
    joined = '. '.join(cleaned)
    # Remove literal backslash-n sequences left over from upstream text.
    joined = joined.replace('\\n', '')
    # One sentence per line, with a trailing full stop on the last one.
    return joined.replace('. ', '.\n') + '.'
def get_Lexrank(text, nr_sentences):
    """Rank ``text`` with sumy's LexRank and return the top sentences.

    Uses the English tokenizer, stemmer and stop-word list.  The result is
    a list of sumy Sentence objects, at most ``nr_sentences`` long.
    """
    language = "english"
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LexRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    return list(summarizer(parser.document, nr_sentences))