Spaces:
Build error
Build error
| import re | |
def get_sentences(txt):
    """Split *txt* into sentence fragments at every period.

    Fragments are returned verbatim: surrounding whitespace is kept, and a
    trailing period yields a final empty string.
    """
    fragments = txt.split('.')
    return fragments
def get_words(txt):
    """Return the alphanumeric words of *txt* as a list.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed, then the text is split on runs of whitespace.

    The character class deliberately contains a single leading ``^``: any
    further ``^`` inside the brackets is a literal caret, which would make
    carets survive the cleanup. Splitting on whitespace runs (rather than on a
    single space) avoids empty-string "words" for repeated spaces and also
    separates words divided by tabs.
    """
    only_words_text = re.sub(r'[^0-9a-zA-Z\s]', '', txt)
    return only_words_text.split()
def get_keywords(word_list, min_ratio=0.001, max_ratio=0.5):
    """Select the words whose relative frequency lies in a given band.

    A word is a keyword when its share of ``word_list`` (occurrences divided
    by the list length) is between ``min_ratio`` and ``max_ratio``, both
    bounds inclusive. Both ratios must be below 1.

    Returns a set of the selected words.
    """
    assert min_ratio < 1 and max_ratio < 1

    # Tally how many times each word occurs.
    occurrences = {}
    for token in word_list:
        occurrences[token] = occurrences.get(token, 0) + 1

    return {
        token
        for token, hits in occurrences.items()
        if min_ratio <= hits * 1.0 / len(word_list) <= max_ratio
    }
def get_sentence_weight(sentence, keywords):
    """Score a sentence by how densely its keywords are clustered.

    The sentence is split on single spaces and each token is compared
    verbatim against ``keywords``. The "window" is the span from the first
    keyword token to the last one, inclusive. The weight is::

        (number of keyword tokens) ** 2 / window size

    Args:
        sentence: The sentence text.
        keywords: A container of keyword strings (a set gives O(1) lookups).

    Returns:
        The weight as a float, or ``0`` when the sentence contains no keyword.
    """
    tokens = sentence.split(' ')

    # Collect every keyword position in one pass. Taking the first and last
    # entries covers index 0 as well, which a backward scan that stops before
    # index 0 would miss (a lone leading keyword would then score 0).
    positions = [i for i, token in enumerate(tokens) if token in keywords]
    if not positions:
        return 0

    window_size = positions[-1] - positions[0] + 1
    keywords_cnt = len(positions)
    return keywords_cnt * keywords_cnt * 1.0 / window_size
def summarize(text, ratio=0.2):
    """Build an extractive summary from the highest-weighted sentences.

    Keywords are the words whose relative frequency in the text is between
    0.05 and 0.5. Each sentence is scored with ``get_sentence_weight``; the
    best-scoring sentences are kept and emitted in their original order,
    joined with ``'. '``.

    Args:
        text: The text to summarize.
        ratio: Fraction of the distinct sentences to keep (default 0.2).

    Returns:
        The summary string; empty when the kept fraction rounds down to zero.
    """
    # Turn line breaks into spaces; deleting them outright would fuse the
    # last word of one line with the first word of the next.
    txt = text.replace('\n', ' ')

    keywords = get_keywords(get_words(txt), 0.05, 0.5)
    sentence_list = get_sentences(txt)

    # Score each distinct sentence once.
    sentence_weight = {}
    for sen in sentence_list:
        if sen not in sentence_weight:
            sentence_weight[sen] = get_sentence_weight(sen, keywords)

    # Rank sentence positions instead of sentence strings, so restoring the
    # original order is a plain integer sort rather than repeated
    # list.index() scans. The sort is stable, so ties keep document order.
    ranked = sorted(
        range(len(sentence_list)),
        key=lambda i: sentence_weight[sentence_list[i]],
        reverse=True,
    )
    keep = max(0, int(len(sentence_weight) * ratio))
    chosen = sorted(ranked[:keep])

    return '. '.join(sentence_list[i] for i in chosen)