# nepali-summarization / summarizer.py
# sulavbcaa's picture
# Upload Sulav TextRank summarization implementation
# 3061dc8 verified
import tokenizer as text_rank_tokenizer
import ranker as text_rank_ranker
import numpy as np
def _read_text(path):
    """Return the whole UTF-8 decoded content of the file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the content has been read, instead of being left to the garbage
    collector.
    """
    with open(path, 'r', encoding="utf-8") as handle:
        return handle.read()


# Resource files are loaded once, at import time. The paths are relative to
# the current working directory, so the module must be imported from a
# directory that contains ``text_rank/``.

# Kept as single raw strings (the file content is not split into lines).
stop_words = _read_text("./text_rank/stopwords.txt")
word_endings = _read_text("./text_rank/word_endings.txt")

# Kept as lists with one entry per line of the file.
kriyapads = _read_text("./text_rank/minimal_kriyapad.txt").split("\n")
samyojaks = _read_text("./text_rank/samyojak.txt").split("\n")

# Only the path is stored here; it is handed to the tokenizer's
# ``get_valid_chars`` on every summarization call.
valid_chars = "./text_rank/valid_chars.json"
def get_summary_from_text(text,force_use_purnabiram_model=False):
    """Build an extractive summary of ``text`` by ranking its sentences.

    The text is split into sentences, stop words are filtered out, the
    remaining words are tokenized and ranked through an association matrix,
    and the sentences are then scored from the word ranks. The ranker is
    asked for ``ceil(0.33 * number_of_sentences)`` sentences, which are
    joined into the returned summary.

    Args:
        text: The input text to summarize.
        force_use_purnabiram_model: When true, ``add_purnabiram`` is always
            applied to the text before sentence splitting. When false, it
            is applied only if the text has fewer than one "।" per 100
            characters.

    Returns:
        * A fixed error message string when no sentence is found.
        * The sentence container itself when exactly one sentence is found.
        * Otherwise, the value produced by the ranker's
          ``get_summarized_text``.
    """
    # Count the sentence terminators first; this is also the first
    # operation performed on ``text``.
    purnabiram_total = text.count("।")

    # The text counts as already punctuated only when the caller did not
    # force the purnabiram model and there is at least one "।" per 100
    # characters.
    already_punctuated = (
        not force_use_purnabiram_model
        and purnabiram_total * 100 >= len(text)
    )

    allowed_characters = text_rank_tokenizer.get_valid_chars(valid_chars)
    if not already_punctuated:
        text = text_rank_tokenizer.add_purnabiram(text, kriyapads, samyojaks)

    # Split the text into its sentences.
    sentences = text_rank_tokenizer.get_sentences_as_arr(text)
    # NOTE(review): the cleaned text is not read again below; the sentences
    # were already extracted from the uncleaned text. Confirm whether the
    # cleaning was meant to run before the split.
    text = text_rank_tokenizer.remove_useless_characters(text, allowed_characters)
    sentences = text_rank_tokenizer.remove_repeating_sentences(sentences)

    sentence_total = len(sentences)
    if sentence_total == 0:
        return "It is not a valid text. Please try again with a valid text."
    if sentence_total == 1:
        # NOTE(review): this returns the sentence container, not the result
        # of get_summarized_text like the main path does; confirm callers
        # expect that.
        return sentences

    # Break every sentence into its words, then drop the stop words.
    word_rows = text_rank_tokenizer.get_words_as_arr(sentences)
    word_rows = text_rank_tokenizer.remove_stop_words_and_filter_word_arr(
        word_rows, word_endings, stop_words
    )

    # Drop the sentences that the tokenizer considers empty after filtering,
    # keeping ``sentences`` and ``word_rows`` aligned.
    sentences, word_rows = text_rank_tokenizer.remove_empty_sentences(
        sentences, word_rows
    )

    # Map the words to numeric tokens.
    tokens, token_dict = text_rank_tokenizer.tokenize(word_rows)

    # Build the association matrix over the token vocabulary.
    association_matrix, counter_vector = text_rank_ranker.create_association_matrix(
        tokens, No_of_unique_chars=len(token_dict)
    )

    # Rank the words, then derive the sentence scores from the word ranks.
    word_scores = text_rank_ranker.calculate_word_ranks(
        association_matrix, counter_vector
    )
    sentence_scores = text_rank_ranker.calculate_sentence_influence(
        tokens, word_scores
    )

    # Ask for roughly a third of the sentences. np.ceil yields a NumPy
    # float, which is passed through to the ranker unchanged.
    requested_count = np.ceil(len(sentences) * 0.33)
    top_sentences = text_rank_ranker.get_n_influencial_sentence(
        sentences, sentence_scores, n=requested_count
    )
    return text_rank_ranker.get_summarized_text(top_sentences)