# NLP Pkgs
"""
@file spacy_summarizer.py
@description Auxiliary module for SpaCy-based text summarization.
             Contains logic for text processing and sentence ranking.
@author Amey Thakur
@author Mega Satish
@created 2022-08-09
@repository https://github.com/Amey-Thakur/TEXT-SUMMARIZER
@license MIT
"""
import spacy

# Pkgs for Normalizing Text
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Import Heapq for Finding the Top N Sentences
from heapq import nlargest

# The 'en' shortcut only resolves on spaCy 2.x; spaCy 3+ raises OSError for
# it, so fall back to the full name of the small English pipeline.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

# Set form of string.punctuation for O(1) per-character membership tests.
_PUNCTUATION_CHARS = frozenset(punctuation)


def _normalize_token(text):
    """Return the lowercase frequency key for a token text, or None to skip it.

    A token is skipped when it is empty/whitespace, is an English stopword,
    or consists solely of ASCII punctuation characters.
    """
    key = text.strip().lower()
    if not key or key in STOP_WORDS:
        return None
    if all(ch in _PUNCTUATION_CHARS for ch in key):
        return None
    return key


def text_summarizer(raw_docx, num_sentences=7, max_sentence_words=30):
    """Build an extractive summary of ``raw_docx`` by frequency-based ranking.

    Each non-stopword, non-punctuation token is counted case-insensitively
    and the counts are normalized by the highest count. A sentence's score is
    the sum of the normalized frequencies of its tokens. The highest-scoring
    sentences are joined with single spaces, in descending score order.

    The original text, the summary and both character lengths are printed to
    stdout, and the summary is also returned.

    Args:
        raw_docx: The text to summarize.
        num_sentences: Maximum number of sentences kept in the summary.
        max_sentence_words: Sentences whose space-separated word count is
            greater than or equal to this value are never selected.

    Returns:
        The summary string; empty when the text has no countable words or no
        sentence qualifies.
    """
    raw_text = raw_docx
    docx = nlp(raw_text)

    # Build Word Frequency (keys are lowercased so that the sentence-scoring
    # lookup below, which is also lowercased, actually finds them).
    word_frequencies = {}
    for token in docx:
        key = _normalize_token(token.text)
        if key is not None:
            word_frequencies[key] = word_frequencies.get(key, 0) + 1

    sentence_scores = {}
    # Guard: max() on an empty mapping would raise ValueError.
    if word_frequencies:
        maximum_frequency = max(word_frequencies.values())
        for key in word_frequencies:
            word_frequencies[key] = word_frequencies[key] / maximum_frequency

        # Sentence Scores
        for sent in docx.sents:
            # The length limit depends only on the sentence, so test it once
            # instead of once per token.
            if len(sent.text.split(' ')) >= max_sentence_words:
                continue
            score = 0
            for token in sent:
                key = _normalize_token(token.text)
                if key is not None:
                    score += word_frequencies.get(key, 0)
            # Every stored frequency is positive, so a zero score means no
            # token matched; such sentences are left out of the ranking.
            if score:
                sentence_scores[sent] = score

    summarized_sentences = nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    final_sentences = [sent.text for sent in summarized_sentences]
    summary = ' '.join(final_sentences)

    print("Original Document\n")
    print(raw_docx)
    print("Total Length:", len(raw_docx))
    print('\n\nSummarized Document\n')
    print(summary)
    print("Total Length:", len(summary))

    return summary