File size: 2,917 Bytes
7107674
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# NLP Pkgs
"""
@file spacy_summarization.py
@description Implementation of text summarization logic using the SpaCy library. 
Utilizes tokenization, stop-word removal, and sentence weighting for summary generation.

@author Amey Thakur <https://github.com/Amey-Thakur>
@author Mega Satish <https://github.com/msatmod>
@created 2022-08-09
@repository https://github.com/Amey-Thakur/TEXT-SUMMARIZER
@license MIT
"""

import spacy 
# Load the small English pipeline once, at import time, so every call to
# text_summarizer() reuses the same `nlp` object instead of reloading it.
# NOTE(review): this requires the "en_core_web_sm" model package to be
# installed in the environment; importing this module fails otherwise.
nlp = spacy.load("en_core_web_sm")
# Text-normalization helpers: spaCy's English stop-word collection and the
# ASCII punctuation characters from the standard library.
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# heapq.nlargest selects the N highest-scoring sentences.
from heapq import nlargest



def text_summarizer(raw_docx, max_sentences=7, max_sentence_words=30):
    """
    Generates an extractive summary using the SpaCy NLP pipeline.
    Words are weighted by their frequency relative to the most frequent
    content word, and each sentence is scored by the sum of its word weights.

    @param raw_docx (str): The raw input text string to be summarized.
    @param max_sentences (int): Maximum number of sentences in the summary (default 7).
    @param max_sentence_words (int): Sentences with this many or more
        space-separated words are excluded from the summary (default 30).
    @return summary (str): The selected sentences joined by single spaces,
        ordered from highest to lowest score. Returns an empty string when
        the text contains no scorable words (e.g. empty input).
    """
    docx = nlp(raw_docx)  # Process text through SpaCy pipeline

    # 1. Build Word Frequency Distribution
    # Keys are lowercased so that the lookup in step 3 (which also lowercases)
    # actually matches, and so capitalized stop words ("The") are filtered.
    # Punctuation and whitespace tokens are skipped; otherwise a token such as
    # "." can become the most frequent "word" and distort the normalization.
    word_frequencies = {}
    for token in docx:
        if token.is_punct or token.is_space:
            continue
        term = token.text.lower()
        if term in STOP_WORDS:
            continue
        word_frequencies[term] = word_frequencies.get(term, 0) + 1

    # Nothing to score (empty text or stop words only): avoid max() on an
    # empty sequence, which would raise ValueError.
    if not word_frequencies:
        return ''

    # 2. Normalize Word Frequencies
    # Scale counts to the (0, 1] range relative to the most frequent word.
    maximum_frequency = max(word_frequencies.values())
    word_weights = {
        term: count / maximum_frequency
        for term, count in word_frequencies.items()
    }

    # 3. Calculate Sentence Scores
    # Sum the weights of the words in each sufficiently short sentence.
    sentence_scores = {}
    for sent in docx.sents:
        # Filter out long sentences to keep the summary concise. The check
        # depends only on the sentence, so it is done once per sentence.
        if len(sent.text.split(' ')) >= max_sentence_words:
            continue
        for token in sent:
            weight = word_weights.get(token.text.lower())
            if weight is not None:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + weight

    # 4. Extract Top Sentences
    # Select the highest-scoring sentences for the final summary.
    summarized_sentences = nlargest(
        max_sentences, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(sent.text for sent in summarized_sentences)