"""
@file nltk_summarization.py
@description Implementation of text summarization logic using the NLTK library. 
Provides functions to calculate sentence scores based on word frequency.

@author Amey Thakur <https://github.com/Amey-Thakur>
@author Mega Satish <https://github.com/msatmod>
@created 2022-08-09
@repository https://github.com/Amey-Thakur/TEXT-SUMMARIZER
@license MIT
"""

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import heapq  

def nltk_summarizer(raw_text):
	"""
	Generate an extractive summary using NLTK word-frequency scoring.

	Sentences are ranked by the sum of the normalized frequencies of their
	words; the top 7 highest-scoring sentences (among those shorter than
	30 whitespace-delimited words) are concatenated in rank order.

	@param raw_text (str): The original text content to be summarized.
	@return summary (str): The top-ranked sentences joined by spaces, or an
		empty string when the text contains no scorable words.
	"""
	stop_words = set(stopwords.words("english"))

	# 1. Word frequencies.
	# Lowercase before tokenizing so counting matches the lowercased
	# sentence-scoring pass below (previously capitalized words were counted
	# but never matched during scoring). Skip punctuation tokens so "." and
	# "," do not dominate the frequency table.
	word_frequencies = {}
	for word in nltk.word_tokenize(raw_text.lower()):
		if word.isalnum() and word not in stop_words:
			word_frequencies[word] = word_frequencies.get(word, 0) + 1

	# Guard: empty text, or text made only of stopwords/punctuation, would
	# otherwise make max() raise ValueError.
	if not word_frequencies:
		return ''

	# 2. Normalize frequencies.
	# Divide by the maximum count so every weight falls in (0, 1].
	maximum_frequency = max(word_frequencies.values())
	for word in word_frequencies:
		word_frequencies[word] = word_frequencies[word] / maximum_frequency

	# 3. Sentence scores.
	# Score each sufficiently short sentence by summing the weights of its
	# known words.
	sentence_scores = {}
	for sent in nltk.sent_tokenize(raw_text):
		# Restrict to sentences with fewer than 30 words to avoid
		# excessively long summary sentences.
		if len(sent.split(' ')) >= 30:
			continue
		for word in nltk.word_tokenize(sent.lower()):
			if word in word_frequencies:
				sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

	# 4. Summary: the 7 sentences with the highest cumulative scores.
	summary_sentences = heapq.nlargest(7, sentence_scores, key=sentence_scores.get)
	return ' '.join(summary_sentences)