Spaces:

ameythakur
/

text-summarizer

Running

App Files Files Community

text-summarizer / Source Code /spacy_summarization.py

ameythakur

Text Summarizer

7107674 8 days ago

raw

history blame contribute delete

2.92 kB

	# NLP Pkgs
	"""
	@file spacy_summarization.py
	@description Implementation of text summarization logic using the SpaCy library.
	Utilizes tokenization, stop-word removal, and sentence weighting for summary generation.

	@author Amey Thakur <https://github.com/Amey-Thakur>
	@author Mega Satish <https://github.com/msatmod>
	@created 2022-08-09
	@repository https://github.com/Amey-Thakur/TEXT-SUMMARIZER
	@license MIT
	"""

	import spacy
	nlp = spacy.load("en_core_web_sm")
	# Pkgs for Normalizing Text
	from spacy.lang.en.stop_words import STOP_WORDS
	from string import punctuation
	# Import Heapq for Finding the Top N Sentences
	from heapq import nlargest



	def text_summarizer(raw_docx, num_sentences=7, max_sentence_words=30):
	    """
	    Generates an extractive summary using the SpaCy NLP pipeline.

	    Each non-stopword is weighted by its frequency relative to the most
	    frequent such word in the text. A sentence's score is the sum of the
	    weights of the words it contains, and the highest-scoring sentences are
	    joined to form the summary.

	    @param raw_docx (str): The raw input text string to be summarized.
	    @param num_sentences (int): Maximum number of sentences to keep (default 7).
	    @param max_sentence_words (int): Sentences with this many or more
	        space-separated words are excluded from the summary (default 30).
	    @return summary (str): The selected sentences joined by single spaces,
	        ordered from highest to lowest score. An empty string is returned
	        when the text contains no scorable words.
	    """
	    docx = nlp(raw_docx)  # Process text through the SpaCy pipeline

	    # 1. Build Word Frequency Distribution
	    # Keys are lower-cased so they match the lower-cased lookups made while
	    # scoring sentences, and so capitalised stop words ("The") are filtered.
	    # Punctuation and whitespace tokens are skipped; otherwise "." or ","
	    # tends to become the most frequent "word" and skews normalization.
	    word_frequencies = {}
	    for token in docx:
	        if token.is_punct or token.is_space:
	            continue
	        key = token.text.lower()
	        if key in STOP_WORDS:
	            continue
	        word_frequencies[key] = word_frequencies.get(key, 0) + 1

	    # Nothing to score (empty, whitespace-only or stop-word-only input):
	    # return early instead of letting max() raise ValueError on an empty dict.
	    if not word_frequencies:
	        return ''

	    # 2. Normalize Word Frequencies
	    # Scale counts into (0, 1] relative to the most frequent word.
	    maximum_frequency = max(word_frequencies.values())
	    word_frequencies = {
	        word: count / maximum_frequency
	        for word, count in word_frequencies.items()
	    }

	    # 3./4. Sentence Tokenization and Scoring
	    # The dict is filled in document order, so ties between equally scored
	    # sentences are resolved in favour of the earlier sentence.
	    sentence_scores = {}
	    for sent in docx.sents:
	        # Filter out long sentences to keep the summary concise; the check
	        # is per sentence, so it is done once rather than once per token.
	        if len(sent.text.split(' ')) >= max_sentence_words:
	            continue
	        weights = [
	            word_frequencies[token.text.lower()]
	            for token in sent
	            if token.text.lower() in word_frequencies
	        ]
	        # Sentences without any weighted word are not candidates at all.
	        if weights:
	            sentence_scores[sent] = sum(weights)

	    # 5. Extract Top Sentences
	    summarized_sentences = nlargest(
	        num_sentences, sentence_scores, key=sentence_scores.get
	    )
	    return ' '.join(sent.text for sent in summarized_sentences)