Spaces:

segestic
/

ArticlePara

Runtime error

ArticlePara / summarizer.py

olusegun.odewole

first commit

8749106 about 3 years ago

6.23 kB

	import nltk
	nltk.download('punkt')
	nltk.download('stopwords')

	import math

	from nltk import sent_tokenize, word_tokenize, PorterStemmer
	from nltk.corpus import stopwords


	def _create_frequency_table(text_string) -> dict:
	    """
	    Build a word-frequency table for a piece of text.

	    The text is split with ``word_tokenize``; English stop words are dropped
	    and every remaining token is reduced to its stem with ``PorterStemmer``
	    before being counted.

	    :param text_string: text to tokenize and count
	    :return: mapping of stemmed token -> number of occurrences
	    :rtype: dict
	    """
	    stop_words = set(stopwords.words("english"))
	    stemmer = PorterStemmer()

	    freq_table = {}
	    for token in word_tokenize(text_string):
	        lowered = token.lower()
	        # Check the surface form first: stemming rewrites many stop words
	        # (e.g. "this" -> "thi", "was" -> "wa"), so testing only the stem
	        # would let them through.
	        if lowered in stop_words:
	            continue

	        stem = stemmer.stem(lowered)
	        # Keep the post-stem check as well so everything that used to be
	        # filtered is still filtered.
	        if stem in stop_words:
	            continue

	        freq_table[stem] = freq_table.get(stem, 0) + 1

	    return freq_table



	def _create_frequency_matrix(sentences):
	    """
	    Build one word-frequency table per sentence.

	    Each sentence is tokenized with ``word_tokenize``; English stop words are
	    dropped and the remaining tokens are lowercased, stemmed and counted.

	    :param sentences: iterable of sentence strings
	    :return: dict mapping the first 15 characters of each sentence to its
	        {stem: count} table
	    """
	    frequency_matrix = {}
	    stop_words = set(stopwords.words("english"))
	    stemmer = PorterStemmer()

	    for sent in sentences:
	        freq_table = {}
	        for token in word_tokenize(sent):
	            lowered = token.lower()
	            # Check the surface form first: stemming rewrites many stop
	            # words (e.g. "this" -> "thi", "was" -> "wa"), so testing only
	            # the stem would let them through.
	            if lowered in stop_words:
	                continue

	            stem = stemmer.stem(lowered)
	            # Keep the post-stem check as well so everything that used to
	            # be filtered is still filtered.
	            if stem in stop_words:
	                continue

	            freq_table[stem] = freq_table.get(stem, 0) + 1

	        # NOTE: the key is only the first 15 characters of the sentence, so
	        # sentences sharing that prefix overwrite each other. The key format
	        # is kept because _generate_summary looks sentences up the same way.
	        frequency_matrix[sent[:15]] = freq_table

	    return frequency_matrix




	def _create_tf_matrix(freq_matrix):
	    """
	    Convert per-sentence term counts into term-frequency values.

	    Each count is divided by the number of distinct terms in that sentence's
	    table (``len`` of the table, not the sum of its counts).

	    :param freq_matrix: dict of sentence key -> {term: count}
	    :return: dict of sentence key -> {term: term frequency}
	    """
	    return {
	        sentence_key: {
	            term: occurrences / len(term_counts)
	            for term, occurrences in term_counts.items()
	        }
	        for sentence_key, term_counts in freq_matrix.items()
	    }



	def _create_documents_per_words(freq_matrix):
	    """
	    Count, for every term, how many sentence tables contain it.

	    :param freq_matrix: dict of sentence key -> {term: count}
	    :return: dict of term -> number of tables in which the term appears
	    """
	    containing_tables = {}

	    for term_counts in freq_matrix.values():
	        # Only presence matters here, not how often the term occurs.
	        for term in term_counts:
	            containing_tables[term] = containing_tables.get(term, 0) + 1

	    return containing_tables


	def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
	    """
	    Compute the inverse document frequency of every term in every sentence.

	    The value for a term is ``log10(total_documents / documents containing
	    the term)``.

	    :param freq_matrix: dict of sentence key -> {term: count}
	    :param count_doc_per_words: dict of term -> number of sentences containing it
	    :param total_documents: total number of sentences
	    :return: dict of sentence key -> {term: idf value}
	    """
	    return {
	        sentence_key: {
	            term: math.log10(total_documents / float(count_doc_per_words[term]))
	            for term in term_counts
	        }
	        for sentence_key, term_counts in freq_matrix.items()
	    }



	def _create_tf_idf_matrix(tf_matrix, idf_matrix):
	    """
	    Multiply term frequencies by inverse document frequencies.

	    Values are paired by sentence key and by term rather than by position,
	    so the result does not depend on the two matrices sharing the same
	    insertion order.

	    :param tf_matrix: dict of sentence key -> {term: tf value}
	    :param idf_matrix: dict of sentence key -> {term: idf value}
	    :return: dict of sentence key -> {term: tf * idf}
	    :raises KeyError: if a sentence key or term of ``tf_matrix`` is missing
	        from ``idf_matrix``
	    """
	    tf_idf_matrix = {}

	    for sentence_key, tf_table in tf_matrix.items():
	        idf_table = idf_matrix[sentence_key]
	        tf_idf_matrix[sentence_key] = {
	            term: float(tf_value * idf_table[term])
	            for term, tf_value in tf_table.items()
	        }

	    return tf_idf_matrix


	def _score_sentences(tf_idf_matrix) -> dict:
	    """
	    Score each sentence by the average TF-IDF value of its terms.

	    The score is the sum of the sentence's TF-IDF values divided by the
	    number of terms in its table. Sentences whose table is empty (every token
	    was filtered out) have no meaningful average and are left out of the
	    result instead of raising ``ZeroDivisionError``.

	    :param tf_idf_matrix: dict of sentence key -> {term: tf-idf value}
	    :return: dict of sentence key -> average tf-idf score
	    :rtype: dict
	    """
	    sentenceValue = {}

	    for sent, f_table in tf_idf_matrix.items():
	        if not f_table:
	            # Nothing to average; skip rather than divide by zero.
	            continue
	        sentenceValue[sent] = sum(f_table.values()) / len(f_table)

	    return sentenceValue

	def _find_average_score(sentenceValue) -> float:
	    """
	    Find the average score from the sentence value dictionary.

	    :param sentenceValue: dict of sentence key -> score
	    :return: mean of all scores, or 0.0 when the dictionary is empty
	    :rtype: float
	    """
	    if not sentenceValue:
	        # No sentences were scored; avoid dividing by zero.
	        return 0.0

	    return sum(sentenceValue.values()) / len(sentenceValue)


	def _generate_summary(sentences, sentenceValue, threshold):
	    """
	    Join the sentences whose score reaches the threshold.

	    Scores are looked up by the first 15 characters of each sentence;
	    sentences without a score are skipped.

	    :param sentences: sentences in their original order
	    :param sentenceValue: dict of sentence key (first 15 characters) -> score
	    :param threshold: minimum score a sentence needs to be kept
	    :return: the kept sentences in order, each preceded by a single space
	        (so a non-empty summary starts with a space); '' when none qualify
	    """
	    selected = []

	    for sentence in sentences:
	        score = sentenceValue.get(sentence[:15])
	        if score is not None and score >= threshold:
	            selected.append(" " + sentence)

	    return "".join(selected)


	def run_summarization(text, threshold_factor=1.3):
	    """
	    Summarize an article by keeping its highest-scoring sentences.

	    Every sentence is treated as a document: term frequencies and inverse
	    document frequencies are combined into TF-IDF values, each sentence is
	    scored by the average TF-IDF value of its terms, and sentences scoring at
	    least ``threshold_factor`` times the average sentence score are kept.

	    :param text: Plain text of a long article
	    :param threshold_factor: multiplier applied to the average sentence score
	        to obtain the cut-off; larger values give shorter summaries
	    :return: the summary text, or '' when the text has no sentences or no
	        sentence could be scored
	    """
	    # 1 Sentence Tokenize
	    sentences = sent_tokenize(text)
	    if not sentences:
	        # Empty input: there is nothing to score and the average below
	        # would otherwise divide by zero.
	        return ''
	    total_documents = len(sentences)

	    # 2 Create the Frequency matrix of the words in each sentence.
	    freq_matrix = _create_frequency_matrix(sentences)

	    # 3 Calculate TermFrequency and generate a matrix
	    tf_matrix = _create_tf_matrix(freq_matrix)

	    # 4 creating table for documents per words
	    count_doc_per_words = _create_documents_per_words(freq_matrix)

	    # 5 Calculate IDF (how rare a word is across sentences) and generate a matrix
	    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

	    # 6 Calculate TF-IDF and generate a matrix
	    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

	    # 7 Important Algorithm: score the sentences
	    sentence_scores = _score_sentences(tf_idf_matrix)
	    if not sentence_scores:
	        # No sentence received a score, so no threshold can be derived.
	        return ''

	    # 8 Find the threshold
	    threshold = _find_average_score(sentence_scores)

	    # 9 Important Algorithm: Generate the summary
	    summary = _generate_summary(sentences, sentence_scores, threshold_factor * threshold)
	    return summary



	#usage = run_summarization(text_str)










	# def text_summarize(ARTICLE, maxLength, minLength):
	# output = summarizer(ARTICLE)[0]['summary_text']
	# ans = text_paraphrase(output)
	# return ans