Upload Sulav TextRank summarization implementation

3061dc8 verified about 21 hours ago

3.17 kB

	import tokenizer as text_rank_tokenizer
	import ranker as text_rank_ranker
	import numpy as np

	def _read_text_resource(path):
	    """Return the whole UTF-8 decoded content of the file at *path*.

	    The file is opened in a ``with`` block so the handle is closed as soon
	    as the content has been read.
	    """
	    with open(path, 'r', encoding="utf-8") as handle:
	        return handle.read()


	# Resource files are addressed relative to the current working directory
	# ("./text_rank/..."), so the module must be imported from the directory
	# that contains the ``text_rank`` folder.
	# ``stop_words`` and ``word_endings`` are kept as raw file content;
	# ``kriyapads`` and ``samyojaks`` are split into one entry per line.
	stop_words = _read_text_resource("./text_rank/stopwords.txt")
	word_endings = _read_text_resource("./text_rank/word_endings.txt")
	kriyapads = _read_text_resource("./text_rank/minimal_kriyapad.txt").split("\n")
	samyojaks = _read_text_resource("./text_rank/samyojak.txt").split("\n")
	# Only the path is stored here; it is handed to the tokenizer at call time.
	valid_chars = "./text_rank/valid_chars.json"

	def get_summary_from_text(text, force_use_purnabiram_model=False, summary_ratio=0.33):
	    """Build an extractive summary of *text* from its highest-ranked sentences.

	    The text is split into sentences, repeated sentences are dropped, the
	    remaining words are filtered and tokenized, and the ranker scores every
	    sentence. The ``ceil(len(sentences) * summary_ratio)`` most influential
	    sentences are then handed to ``text_rank_ranker.get_summarized_text``.

	    Args:
	        text: Input text to summarize.
	        force_use_purnabiram_model: When true, the text is always passed
	            through ``text_rank_tokenizer.add_purnabiram`` before sentence
	            splitting, regardless of how many "।" it already contains.
	        summary_ratio: Share of the sentences to keep in the summary.
	            Defaults to 0.33.

	    Returns:
	        A fixed error message when no sentence is left after
	        de-duplication; otherwise whatever
	        ``text_rank_ranker.get_summarized_text`` returns for the selected
	        sentences (for a single-sentence input, for that one sentence).
	    """
	    # Fewer than one purnabiram ("।") per 100 characters is treated as text
	    # that lacks sentence terminators.
	    # NOTE(review): forcing is taken to mean "always insert purnabirams";
	    # confirm against the intended nesting of the original if/else.
	    has_enough_purnabirams = text.count("।") * 100 >= len(text)
	    is_complete_sentence = has_enough_purnabirams and not force_use_purnabiram_model

	    valid_characters = text_rank_tokenizer.get_valid_chars(valid_chars)

	    if not is_complete_sentence:
	        text = text_rank_tokenizer.add_purnabiram(text, kriyapads, samyojaks)

	    # Split the text into its sentences.
	    sentences = text_rank_tokenizer.get_sentences_as_arr(text)

	    # NOTE(review): the cleaned text is not read again below because the
	    # sentences were already extracted above; confirm whether this cleanup
	    # was meant to run before the split.
	    text = text_rank_tokenizer.remove_useless_characters(text, valid_characters)

	    sentences = text_rank_tokenizer.remove_repeating_sentences(sentences)

	    if len(sentences) == 0:
	        return "It is not a valid text. Please try again with a valid text."
	    if len(sentences) == 1:
	        # Nothing to rank; format the lone sentence through the same helper
	        # as the normal path instead of returning the raw container.
	        return text_rank_ranker.get_summarized_text(sentences)

	    words_arr = text_rank_tokenizer.get_words_as_arr(sentences)

	    # Remove the stop words from the word arrays.
	    words_arr = text_rank_tokenizer.remove_stop_words_and_filter_word_arr(
	        words_arr, word_endings, stop_words
	    )

	    # Drop empty and lone-word sentences, keeping `sentences` in step
	    # with `words_arr`.
	    sentences, words_arr = text_rank_tokenizer.remove_empty_sentences(sentences, words_arr)

	    # Map the words to numeric tokens.
	    tokens, token_dict = text_rank_tokenizer.tokenize(words_arr)

	    # Build the association matrix over the tokens.
	    association_matrix, counter_vector = text_rank_ranker.create_association_matrix(
	        tokens, No_of_unique_chars=len(token_dict)
	    )

	    # Rank every word, then derive each sentence's influence from the
	    # word ranking.
	    word_influence_vector = text_rank_ranker.calculate_word_ranks(
	        association_matrix, counter_vector
	    )
	    sentence_influence = text_rank_ranker.calculate_sentence_influence(
	        tokens, word_influence_vector
	    )

	    # np.ceil returns a NumPy float; the number of sentences to keep is a
	    # count, so pass it on as a plain int.
	    sentence_count = int(np.ceil(len(sentences) * summary_ratio))
	    summary_sentences = text_rank_ranker.get_n_influencial_sentence(
	        sentences, sentence_influence, n=sentence_count
	    )

	    return text_rank_ranker.get_summarized_text(summary_sentences)